aboutsummaryrefslogtreecommitdiff
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/00-INDEX2
-rw-r--r--Documentation/DMA-API.txt2
-rw-r--r--Documentation/DocBook/kernel-api.tmpl5
-rw-r--r--Documentation/DocBook/mac80211.tmpl12
-rw-r--r--Documentation/RCU/checklist.txt2
-rw-r--r--Documentation/RCU/rcuref.txt16
-rw-r--r--Documentation/RCU/whatisRCU.txt2
-rw-r--r--Documentation/SELinux.txt27
-rw-r--r--Documentation/block/deadline-iosched.txt14
-rw-r--r--Documentation/cdrom/ide-cd3
-rw-r--r--Documentation/feature-removal-schedule.txt21
-rw-r--r--Documentation/filesystems/ext4.txt19
-rw-r--r--Documentation/filesystems/fiemap.txt228
-rw-r--r--Documentation/filesystems/proc.txt73
-rw-r--r--Documentation/kernel-doc-nano-HOWTO.txt4
-rw-r--r--Documentation/kernel-parameters.txt53
-rw-r--r--Documentation/networking/LICENSE.qlge46
-rw-r--r--Documentation/networking/can.txt44
-rw-r--r--Documentation/networking/multiqueue.txt54
-rw-r--r--Documentation/networking/phonet.txt175
-rw-r--r--Documentation/networking/regulatory.txt194
-rw-r--r--Documentation/networking/tproxy.txt85
-rw-r--r--Documentation/rfkill.txt32
-rw-r--r--Documentation/s390/CommonIO11
-rw-r--r--Documentation/scheduler/sched-design-CFS.txt395
-rw-r--r--Documentation/scsi/scsi_fc_transport.txt36
-rw-r--r--Documentation/sound/alsa/ALSA-Configuration.txt62
-rw-r--r--Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl65
-rw-r--r--Documentation/sound/alsa/soc/dapm.txt12
-rw-r--r--Documentation/sparc/sbus_drivers.txt309
-rw-r--r--Documentation/x86/00-INDEX4
-rw-r--r--Documentation/x86/boot.txt (renamed from Documentation/x86/i386/boot.txt)2
-rw-r--r--Documentation/x86/mtrr.txt (renamed from Documentation/mtrr.txt)4
-rw-r--r--Documentation/x86/pat.txt54
-rw-r--r--Documentation/x86/usb-legacy-support.txt (renamed from Documentation/x86/i386/usb-legacy-support.txt)0
-rw-r--r--Documentation/x86/x86_64/boot-options.txt4
-rw-r--r--Documentation/x86/zero-page.txt (renamed from Documentation/x86/i386/zero-page.txt)0
37 files changed, 1457 insertions, 614 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 1427969275a..43827780010 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -249,8 +249,6 @@ mono.txt
- how to execute Mono-based .NET binaries with the help of BINFMT_MISC.
moxa-smartio
- file with info on installing/using Moxa multiport serial driver.
-mtrr.txt
- - how to use PPro Memory Type Range Registers to increase performance.
mutex-design.txt
- info on the generic mutex subsystem.
namespaces/
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index d8b63d164e4..b8e86460046 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -337,7 +337,7 @@ With scatterlists, you use the resulting mapping like this:
int i, count = dma_map_sg(dev, sglist, nents, direction);
struct scatterlist *sg;
- for (i = 0, sg = sglist; i < count; i++, sg++) {
+ for_each_sg(sglist, sg, count, i) {
hw_address[i] = sg_dma_address(sg);
hw_len[i] = sg_dma_len(sg);
}
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index b7b1482f6e0..9d0058e788e 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -283,6 +283,7 @@ X!Earch/x86/kernel/mca_32.c
<chapter id="security">
<title>Security Framework</title>
!Isecurity/security.c
+!Esecurity/inode.c
</chapter>
<chapter id="audit">
@@ -364,6 +365,10 @@ X!Edrivers/pnp/system.c
!Eblock/blk-barrier.c
!Eblock/blk-tag.c
!Iblock/blk-tag.c
+!Eblock/blk-integrity.c
+!Iblock/blktrace.c
+!Iblock/genhd.c
+!Eblock/genhd.c
</chapter>
<chapter id="chrdev">
diff --git a/Documentation/DocBook/mac80211.tmpl b/Documentation/DocBook/mac80211.tmpl
index b651e0a4b1c..77c3c202991 100644
--- a/Documentation/DocBook/mac80211.tmpl
+++ b/Documentation/DocBook/mac80211.tmpl
@@ -145,7 +145,6 @@ usage should require reading the full document.
this though and the recommendation to allow only a single
interface in STA mode at first!
</para>
-!Finclude/net/mac80211.h ieee80211_if_types
!Finclude/net/mac80211.h ieee80211_if_init_conf
!Finclude/net/mac80211.h ieee80211_if_conf
</chapter>
@@ -177,8 +176,7 @@ usage should require reading the full document.
<title>functions/definitions</title>
!Finclude/net/mac80211.h ieee80211_rx_status
!Finclude/net/mac80211.h mac80211_rx_flags
-!Finclude/net/mac80211.h ieee80211_tx_control
-!Finclude/net/mac80211.h ieee80211_tx_status_flags
+!Finclude/net/mac80211.h ieee80211_tx_info
!Finclude/net/mac80211.h ieee80211_rx
!Finclude/net/mac80211.h ieee80211_rx_irqsafe
!Finclude/net/mac80211.h ieee80211_tx_status
@@ -189,12 +187,11 @@ usage should require reading the full document.
!Finclude/net/mac80211.h ieee80211_ctstoself_duration
!Finclude/net/mac80211.h ieee80211_generic_frame_duration
!Finclude/net/mac80211.h ieee80211_get_hdrlen_from_skb
-!Finclude/net/mac80211.h ieee80211_get_hdrlen
+!Finclude/net/mac80211.h ieee80211_hdrlen
!Finclude/net/mac80211.h ieee80211_wake_queue
!Finclude/net/mac80211.h ieee80211_stop_queue
-!Finclude/net/mac80211.h ieee80211_start_queues
-!Finclude/net/mac80211.h ieee80211_stop_queues
!Finclude/net/mac80211.h ieee80211_wake_queues
+!Finclude/net/mac80211.h ieee80211_stop_queues
</sect1>
</chapter>
@@ -230,8 +227,7 @@ usage should require reading the full document.
<title>Multiple queues and QoS support</title>
<para>TBD</para>
!Finclude/net/mac80211.h ieee80211_tx_queue_params
-!Finclude/net/mac80211.h ieee80211_tx_queue_stats_data
-!Finclude/net/mac80211.h ieee80211_tx_queue
+!Finclude/net/mac80211.h ieee80211_tx_queue_stats
</chapter>
<chapter id="AP">
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index cf5562cbe35..6e253407b3d 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -210,7 +210,7 @@ over a rather long period of time, but improvements are always welcome!
number of updates per grace period.
9. All RCU list-traversal primitives, which include
- rcu_dereference(), list_for_each_rcu(), list_for_each_entry_rcu(),
+ rcu_dereference(), list_for_each_entry_rcu(),
list_for_each_continue_rcu(), and list_for_each_safe_rcu(),
must be either within an RCU read-side critical section or
must be protected by appropriate update-side locks. RCU
diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt
index 451de2ad832..4202ad09313 100644
--- a/Documentation/RCU/rcuref.txt
+++ b/Documentation/RCU/rcuref.txt
@@ -29,9 +29,9 @@ release_referenced() delete()
}
If this list/array is made lock free using RCU as in changing the
-write_lock() in add() and delete() to spin_lock and changing read_lock
-in search_and_reference to rcu_read_lock(), the atomic_get in
-search_and_reference could potentially hold reference to an element which
+write_lock() in add() and delete() to spin_lock() and changing read_lock()
+in search_and_reference() to rcu_read_lock(), the atomic_inc() in
+search_and_reference() could potentially hold reference to an element which
has already been deleted from the list/array. Use atomic_inc_not_zero()
in this scenario as follows:
@@ -40,20 +40,20 @@ add() search_and_reference()
{ {
alloc_object rcu_read_lock();
... search_for_element
- atomic_set(&el->rc, 1); if (atomic_inc_not_zero(&el->rc)) {
- write_lock(&list_lock); rcu_read_unlock();
+ atomic_set(&el->rc, 1); if (!atomic_inc_not_zero(&el->rc)) {
+ spin_lock(&list_lock); rcu_read_unlock();
return FAIL;
add_element }
... ...
- write_unlock(&list_lock); rcu_read_unlock();
+ spin_unlock(&list_lock); rcu_read_unlock();
} }
3. 4.
release_referenced() delete()
{ {
- ... write_lock(&list_lock);
+ ... spin_lock(&list_lock);
if (atomic_dec_and_test(&el->rc)) ...
call_rcu(&el->head, el_free); delete_element
- ... write_unlock(&list_lock);
+ ... spin_unlock(&list_lock);
} ...
if (atomic_dec_and_test(&el->rc))
call_rcu(&el->head, el_free);
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index e04d643a9f5..96170824a71 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -786,8 +786,6 @@ RCU pointer/list traversal:
list_for_each_entry_rcu
hlist_for_each_entry_rcu
- list_for_each_rcu (to be deprecated in favor of
- list_for_each_entry_rcu)
list_for_each_continue_rcu (to be deprecated in favor of new
list_for_each_entry_continue_rcu)
diff --git a/Documentation/SELinux.txt b/Documentation/SELinux.txt
new file mode 100644
index 00000000000..07eae00f331
--- /dev/null
+++ b/Documentation/SELinux.txt
@@ -0,0 +1,27 @@
+If you want to use SELinux, chances are you will want
+to use the distro-provided policies, or install the
+latest reference policy release from
+ http://oss.tresys.com/projects/refpolicy
+
+However, if you want to install a dummy policy for
+testing, you can do using 'mdp' provided under
+scripts/selinux. Note that this requires the selinux
+userspace to be installed - in particular you will
+need checkpolicy to compile a kernel, and setfiles and
+fixfiles to label the filesystem.
+
+ 1. Compile the kernel with selinux enabled.
+ 2. Type 'make' to compile mdp.
+ 3. Make sure that you are not running with
+ SELinux enabled and a real policy. If
+ you are, reboot with selinux disabled
+ before continuing.
+ 4. Run install_policy.sh:
+ cd scripts/selinux
+ sh install_policy.sh
+
+Step 4 will create a new dummy policy valid for your
+kernel, with a single selinux user, role, and type.
+It will compile the policy, will set your SELINUXTYPE to
+dummy in /etc/selinux/config, install the compiled policy
+as 'dummy', and relabel your filesystem.
diff --git a/Documentation/block/deadline-iosched.txt b/Documentation/block/deadline-iosched.txt
index c23cab13c3d..72576769e0f 100644
--- a/Documentation/block/deadline-iosched.txt
+++ b/Documentation/block/deadline-iosched.txt
@@ -30,12 +30,18 @@ write_expire (in ms)
Similar to read_expire mentioned above, but for writes.
-fifo_batch
+fifo_batch (number of requests)
----------
-When a read request expires its deadline, we must move some requests from
-the sorted io scheduler list to the block device dispatch queue. fifo_batch
-controls how many requests we move.
+Requests are grouped into ``batches'' of a particular data direction (read or
+write) which are serviced in increasing sector order. To limit extra seeking,
+deadline expiries are only checked between batches. fifo_batch controls the
+maximum number of requests per batch.
+
+This parameter tunes the balance between per-request latency and aggregate
+throughput. When low latency is the primary concern, smaller is better (where
+a value of 1 yields first-come first-served behaviour). Increasing fifo_batch
+generally improves throughput, at the cost of latency variation.
writes_starved (number of dispatches)
diff --git a/Documentation/cdrom/ide-cd b/Documentation/cdrom/ide-cd
index 91c0dcc6fa5..2c558cd6c1e 100644
--- a/Documentation/cdrom/ide-cd
+++ b/Documentation/cdrom/ide-cd
@@ -145,8 +145,7 @@ useful for reading photocds.
To play an audio CD, you should first unmount and remove any data
CDROM. Any of the CDROM player programs should then work (workman,
-workbone, cdplayer, etc.). Lacking anything else, you could use the
-cdtester program in Documentation/cdrom/sbpcd.
+workbone, cdplayer, etc.).
On a few drives, you can read digital audio directly using a program
such as cdda2wav. The only types of drive which I've heard support
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 83c88cae1ed..3d2d0c29f02 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,6 +6,24 @@ be removed from this file.
---------------------------
+What: old static regulatory information and ieee80211_regdom module parameter
+When: 2.6.29
+Why: The old regulatory infrastructure has been replaced with a new one
+ which does not require statically defined regulatory domains. We do
+ not want to keep static regulatory domains in the kernel due to the
+ the dynamic nature of regulatory law and localization. We kept around
+ the old static definitions for the regulatory domains of:
+ * US
+ * JP
+ * EU
+ and used by default the US when CONFIG_WIRELESS_OLD_REGULATORY was
+ set. We also kept around the ieee80211_regdom module parameter in case
+ some applications were relying on it. Changing regulatory domains
+ can now be done instead by using nl80211, as is done with iw.
+Who: Luis R. Rodriguez <lrodriguez@atheros.com>
+
+---------------------------
+
What: dev->power.power_state
When: July 2007
Why: Broken design for runtime control over driver power states, confusing
@@ -232,6 +250,9 @@ What (Why):
- xt_mark match revision 0
(superseded by xt_mark match revision 1)
+ - xt_recent: the old ipt_recent proc dir
+ (superseded by /proc/net/xt_recent)
+
When: January 2009 or Linux 2.7.0, whichever comes first
Why: Superseded by newer revisions or modules
Who: Jan Engelhardt <jengelh@computergmbh.de>
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 0d5394920a3..eb154ef36c2 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -32,9 +32,9 @@ Mailing list: linux-ext4@vger.kernel.org
you will need to merge your changes with the version from e2fsprogs
1.41.x.
- - Create a new filesystem using the ext4dev filesystem type:
+ - Create a new filesystem using the ext4 filesystem type:
- # mke2fs -t ext4dev /dev/hda1
+ # mke2fs -t ext4 /dev/hda1
Or configure an existing ext3 filesystem to support extents and set
the test_fs flag to indicate that it's ok for an in-development
@@ -47,13 +47,13 @@ Mailing list: linux-ext4@vger.kernel.org
# tune2fs -I 256 /dev/hda1
- (Note: we currently do not have tools to convert an ext4dev
+ (Note: we currently do not have tools to convert an ext4
filesystem back to ext3; so please do not do try this on production
filesystems.)
- Mounting:
- # mount -t ext4dev /dev/hda1 /wherever
+ # mount -t ext4 /dev/hda1 /wherever
- When comparing performance with other filesystems, remember that
ext3/4 by default offers higher data integrity guarantees than most.
@@ -177,6 +177,11 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in
your disks are battery-backed in one way or another,
disabling barriers may safely improve performance.
+inode_readahead=n This tuning parameter controls the maximum
+ number of inode table blocks that ext4's inode
+ table readahead algorithm will pre-read into
+ the buffer cache. The default value is 32 blocks.
+
orlov (*) This enables the new Orlov block allocator. It is
enabled by default.
@@ -218,6 +223,11 @@ errors=remount-ro(*) Remount the filesystem read-only on an error.
errors=continue Keep going on a filesystem error.
errors=panic Panic and halt the machine if an error occurs.
+data_err=ignore(*) Just print an error message if an error occurs
+ in a file data buffer in ordered mode.
+data_err=abort Abort the journal if an error occurs in a file
+ data buffer in ordered mode.
+
grpid Give objects the same group ID as their creator.
bsdgroups
@@ -252,6 +262,7 @@ stripe=n Number of filesystem blocks that mballoc will try
delalloc (*) Deferring block allocation until write-out time.
nodelalloc Disable delayed allocation. Blocks are allocation
when data is copied from user to page cache.
+
Data Mode
=========
There are 3 different data modes:
diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt
new file mode 100644
index 00000000000..1e3defcfe50
--- /dev/null
+++ b/Documentation/filesystems/fiemap.txt
@@ -0,0 +1,228 @@
+============
+Fiemap Ioctl
+============
+
+The fiemap ioctl is an efficient method for userspace to get file
+extent mappings. Instead of block-by-block mapping (such as bmap), fiemap
+returns a list of extents.
+
+
+Request Basics
+--------------
+
+A fiemap request is encoded within struct fiemap:
+
+struct fiemap {
+ __u64 fm_start; /* logical offset (inclusive) at
+ * which to start mapping (in) */
+ __u64 fm_length; /* logical length of mapping which
+ * userspace cares about (in) */
+ __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
+ __u32 fm_mapped_extents; /* number of extents that were
+ * mapped (out) */
+ __u32 fm_extent_count; /* size of fm_extents array (in) */
+ __u32 fm_reserved;
+ struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+
+fm_start, and fm_length specify the logical range within the file
+which the process would like mappings for. Extents returned mirror
+those on disk - that is, the logical offset of the 1st returned extent
+may start before fm_start, and the range covered by the last returned
+extent may end after fm_length. All offsets and lengths are in bytes.
+
+Certain flags to modify the way in which mappings are looked up can be
+set in fm_flags. If the kernel doesn't understand some particular
+flags, it will return EBADR and the contents of fm_flags will contain
+the set of flags which caused the error. If the kernel is compatible
+with all flags passed, the contents of fm_flags will be unmodified.
+It is up to userspace to determine whether rejection of a particular
+flag is fatal to it's operation. This scheme is intended to allow the
+fiemap interface to grow in the future but without losing
+compatibility with old software.
+
+fm_extent_count specifies the number of elements in the fm_extents[] array
+that can be used to return extents. If fm_extent_count is zero, then the
+fm_extents[] array is ignored (no extents will be returned), and the
+fm_mapped_extents count will hold the number of extents needed in
+fm_extents[] to hold the file's current mapping. Note that there is
+nothing to prevent the file from changing between calls to FIEMAP.
+
+The following flags can be set in fm_flags:
+
+* FIEMAP_FLAG_SYNC
+If this flag is set, the kernel will sync the file before mapping extents.
+
+* FIEMAP_FLAG_XATTR
+If this flag is set, the extents returned will describe the inodes
+extended attribute lookup tree, instead of it's data tree.
+
+
+Extent Mapping
+--------------
+
+Extent information is returned within the embedded fm_extents array
+which userspace must allocate along with the fiemap structure. The
+number of elements in the fiemap_extents[] array should be passed via
+fm_extent_count. The number of extents mapped by kernel will be
+returned via fm_mapped_extents. If the number of fiemap_extents
+allocated is less than would be required to map the requested range,
+the maximum number of extents that can be mapped in the fm_extent[]
+array will be returned and fm_mapped_extents will be equal to
+fm_extent_count. In that case, the last extent in the array will not
+complete the requested range and will not have the FIEMAP_EXTENT_LAST
+flag set (see the next section on extent flags).
+
+Each extent is described by a single fiemap_extent structure as
+returned in fm_extents.
+
+struct fiemap_extent {
+ __u64 fe_logical; /* logical offset in bytes for the start of
+ * the extent */
+ __u64 fe_physical; /* physical offset in bytes for the start
+ * of the extent */
+ __u64 fe_length; /* length in bytes for the extent */
+ __u64 fe_reserved64[2];
+ __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
+ __u32 fe_reserved[3];
+};
+
+All offsets and lengths are in bytes and mirror those on disk. It is valid
+for an extents logical offset to start before the request or it's logical
+length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is
+returned, fe_logical, fe_physical, and fe_length will be aligned to the
+block size of the file system. With the exception of extents flagged as
+FIEMAP_EXTENT_MERGED, adjacent extents will not be merged.
+
+The fe_flags field contains flags which describe the extent returned.
+A special flag, FIEMAP_EXTENT_LAST is always set on the last extent in
+the file so that the process making fiemap calls can determine when no
+more extents are available, without having to call the ioctl again.
+
+Some flags are intentionally vague and will always be set in the
+presence of other more specific flags. This way a program looking for
+a general property does not have to know all existing and future flags
+which imply that property.
+
+For example, if FIEMAP_EXTENT_DATA_INLINE or FIEMAP_EXTENT_DATA_TAIL
+are set, FIEMAP_EXTENT_NOT_ALIGNED will also be set. A program looking
+for inline or tail-packed data can key on the specific flag. Software
+which simply cares not to try operating on non-aligned extents
+however, can just key on FIEMAP_EXTENT_NOT_ALIGNED, and not have to
+worry about all present and future flags which might imply unaligned
+data. Note that the opposite is not true - it would be valid for
+FIEMAP_EXTENT_NOT_ALIGNED to appear alone.
+
+* FIEMAP_EXTENT_LAST
+This is the last extent in the file. A mapping attempt past this
+extent will return nothing.
+
+* FIEMAP_EXTENT_UNKNOWN
+The location of this extent is currently unknown. This may indicate
+the data is stored on an inaccessible volume or that no storage has
+been allocated for the file yet.
+
+* FIEMAP_EXTENT_DELALLOC
+ - This will also set FIEMAP_EXTENT_UNKNOWN.
+Delayed allocation - while there is data for this extent, it's
+physical location has not been allocated yet.
+
+* FIEMAP_EXTENT_ENCODED
+This extent does not consist of plain filesystem blocks but is
+encoded (e.g. encrypted or compressed). Reading the data in this
+extent via I/O to the block device will have undefined results.
+
+Note that it is *always* undefined to try to update the data
+in-place by writing to the indicated location without the
+assistance of the filesystem, or to access the data using the
+information returned by the FIEMAP interface while the filesystem
+is mounted. In other words, user applications may only read the
+extent data via I/O to the block device while the filesystem is
+unmounted, and then only if the FIEMAP_EXTENT_ENCODED flag is
+clear; user applications must not try reading or writing to the
+filesystem via the block device under any other circumstances.
+
+* FIEMAP_EXTENT_DATA_ENCRYPTED
+ - This will also set FIEMAP_EXTENT_ENCODED
+The data in this extent has been encrypted by the file system.
+
+* FIEMAP_EXTENT_NOT_ALIGNED
+Extent offsets and length are not guaranteed to be block aligned.
+
+* FIEMAP_EXTENT_DATA_INLINE
+ This will also set FIEMAP_EXTENT_NOT_ALIGNED
+Data is located within a meta data block.
+
+* FIEMAP_EXTENT_DATA_TAIL
+ This will also set FIEMAP_EXTENT_NOT_ALIGNED
+Data is packed into a block with data from other files.
+
+* FIEMAP_EXTENT_UNWRITTEN
+Unwritten extent - the extent is allocated but it's data has not been
+initialized. This indicates the extent's data will be all zero if read
+through the filesystem but the contents are undefined if read directly from
+the device.
+
+* FIEMAP_EXTENT_MERGED
+This will be set when a file does not support extents, i.e., it uses a block
+based addressing scheme. Since returning an extent for each block back to
+userspace would be highly inefficient, the kernel will try to merge most
+adjacent blocks into 'extents'.
+
+
+VFS -> File System Implementation
+---------------------------------
+
+File systems wishing to support fiemap must implement a ->fiemap callback on
+their inode_operations structure. The fs ->fiemap call is responsible for
+defining it's set of supported fiemap flags, and calling a helper function on
+each discovered extent:
+
+struct inode_operations {
+ ...
+
+ int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
+ u64 len);
+
+->fiemap is passed struct fiemap_extent_info which describes the
+fiemap request:
+
+struct fiemap_extent_info {
+ unsigned int fi_flags; /* Flags as passed from user */
+ unsigned int fi_extents_mapped; /* Number of mapped extents */
+ unsigned int fi_extents_max; /* Size of fiemap_extent array */
+ struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */
+};
+
+It is intended that the file system should not need to access any of this
+structure directly.
+
+
+Flag checking should be done at the beginning of the ->fiemap callback via the
+fiemap_check_flags() helper:
+
+int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
+
+The struct fieinfo should be passed in as recieved from ioctl_fiemap(). The
+set of fiemap flags which the fs understands should be passed via fs_flags. If
+fiemap_check_flags finds invalid user flags, it will place the bad values in
+fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from
+fiemap_check_flags(), it should immediately exit, returning that error back to
+ioctl_fiemap().
+
+
+For each extent in the request range, the file system should call
+the helper function, fiemap_fill_next_extent():
+
+int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
+ u64 phys, u64 len, u32 flags, u32 dev);
+
+fiemap_fill_next_extent() will use the passed values to populate the
+next free extent in the fm_extents array. 'General' extent flags will
+automatically be set from specific flags on behalf of the calling file
+system so that the userspace API is not broken.
+
+fiemap_fill_next_extent() returns 0 on success, and 1 when the
+user-supplied fm_extents array is full. If an error is encountered
+while copying the extent to user memory, -EFAULT will be returned.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index f566ad9bcb7..d831d24d2a6 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -923,45 +923,44 @@ CPUs.
The "procs_blocked" line gives the number of processes currently blocked,
waiting for I/O to complete.
+
1.9 Ext4 file system parameters
------------------------------
-Ext4 file system have one directory per partition under /proc/fs/ext4/
-# ls /proc/fs/ext4/hdc/
-group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req
-stats stream_req
-
-mb_groups:
-This file gives the details of multiblock allocator buddy cache of free blocks
-
-mb_history:
-Multiblock allocation history.
-
-stats:
-This file indicate whether the multiblock allocator should start collecting
-statistics. The statistics are shown during unmount
-
-group_prealloc:
-The multiblock allocator normalize the block allocation request to
-group_prealloc filesystem blocks if we don't have strip value set.
-The stripe value can be specified at mount time or during mke2fs.
-
-max_to_scan:
-How long multiblock allocator can look for a best extent (in found extents)
-
-min_to_scan:
-How long multiblock allocator must look for a best extent
-
-order2_req:
-Multiblock allocator use 2^N search using buddies only for requests greater
-than or equal to order2_req. The request size is specfied in file system
-blocks. A value of 2 indicate only if the requests are greater than or equal
-to 4 blocks.
-
-stream_req:
-Files smaller than stream_req are served by the stream allocator, whose
-purpose is to pack requests as close each to other as possible to
-produce smooth I/O traffic. Avalue of 16 indicate that file smaller than 16
-filesystem block size will use group based preallocation.
+
+Information about mounted ext4 file systems can be found in
+/proc/fs/ext4. Each mounted filesystem will have a directory in
+/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
+/proc/fs/ext4/dm-0). The files in each per-device directory are shown
+in Table 1-10, below.
+
+Table 1-10: Files in /proc/fs/ext4/<devname>
+..............................................................................
+ File Content
+ mb_groups details of multiblock allocator buddy cache of free blocks
+ mb_history multiblock allocation history
+ stats controls whether the multiblock allocator should start
+ collecting statistics, which are shown during the unmount
+ group_prealloc the multiblock allocator will round up allocation
+ requests to a multiple of this tuning parameter if the
+ stripe size is not set in the ext4 superblock
+ max_to_scan The maximum number of extents the multiblock allocator
+ will search to find the best extent
+ min_to_scan The minimum number of extents the multiblock allocator
+ will search to find the best extent
+ order2_req Tuning parameter which controls the minimum size for
+ requests (as a power of 2) where the buddy cache is
+ used
+ stream_req Files which have fewer blocks than this tunable
+ parameter will have their blocks allocated out of a
+ block group specific preallocation pool, so that small
+ files are packed closely together. Each large file
+ will have its blocks allocated out of its own unique
+ preallocation pool.
+inode_readahead Tuning parameter which controls the maximum number of
+ inode table blocks that ext4's inode table readahead
+ algorithm will pre-read into the buffer cache
+..............................................................................
+
------------------------------------------------------------------------------
Summary
diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt
index 0bd32748a46..c6841eee959 100644
--- a/Documentation/kernel-doc-nano-HOWTO.txt
+++ b/Documentation/kernel-doc-nano-HOWTO.txt
@@ -168,10 +168,10 @@ if ($#ARGV < 0) {
mkdir $ARGV[0],0777;
$state = 0;
while (<STDIN>) {
- if (/^\.TH \"[^\"]*\" 4 \"([^\"]*)\"/) {
+ if (/^\.TH \"[^\"]*\" 9 \"([^\"]*)\"/) {
if ($state == 1) { close OUT }
$state = 1;
- $fn = "$ARGV[0]/$1.4";
+ $fn = "$ARGV[0]/$1.9";
print STDERR "Creating $fn\n";
open OUT, ">$fn" or die "can't open $fn: $!\n";
print OUT $_;
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 1150444a21a..2443f5bb436 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -284,6 +284,11 @@ and is between 256 and 4096 characters. It is defined in the file
isolate - enable device isolation (each device, as far
as possible, will get its own protection
domain)
+ fullflush - enable flushing of IO/TLB entries when
+ they are unmapped. Otherwise they are
+ flushed before they will be reused, which
+ is a lot of faster
+
amd_iommu_size= [HW,X86-64]
Define the size of the aperture for the AMD IOMMU
driver. Possible values are:
@@ -463,12 +468,6 @@ and is between 256 and 4096 characters. It is defined in the file
Range: 0 - 8192
Default: 64
- disable_8254_timer
- enable_8254_timer
- [IA32/X86_64] Disable/Enable interrupt 0 timer routing
- over the 8254 in addition to over the IO-APIC. The
- kernel tries to set a sensible default.
-
hpet= [X86-32,HPET] option to control HPET usage
Format: { enable (default) | disable | force }
disable: disable HPET and use PIT instead
@@ -659,11 +658,12 @@ and is between 256 and 4096 characters. It is defined in the file
earlyprintk= [X86-32,X86-64,SH,BLACKFIN]
earlyprintk=vga
earlyprintk=serial[,ttySn[,baudrate]]
+ earlyprintk=dbgp
Append ",keep" to not disable it when the real console
takes over.
- Only vga or serial at a time, not both.
+ Only vga or serial or usb debug port at a time.
Currently only ttyS0 and ttyS1 are supported.
@@ -1020,6 +1020,10 @@ and is between 256 and 4096 characters. It is defined in the file
(only serial suported for now)
Format: <serial_device>[,baud]
+ kmac= [MIPS] korina ethernet MAC address.
+ Configure the RouterBoard 532 series on-chip
+ Ethernet adapter MAC address.
+
l2cr= [PPC]
l3cr= [PPC]
@@ -1228,6 +1232,29 @@ and is between 256 and 4096 characters. It is defined in the file
or
memmap=0x10000$0x18690000
+ memory_corruption_check=0/1 [X86]
+ Some BIOSes seem to corrupt the first 64k of
+ memory when doing things like suspend/resume.
+ Setting this option will scan the memory
+ looking for corruption. Enabling this will
+ both detect corruption and prevent the kernel
+ from using the memory being corrupted.
+ However, its intended as a diagnostic tool; if
+ repeatable BIOS-originated corruption always
+ affects the same memory, you can use memmap=
+ to prevent the kernel from using that memory.
+
+ memory_corruption_check_size=size [X86]
+ By default it checks for corruption in the low
+ 64k, making this memory unavailable for normal
+ use. Use this parameter to scan for
+ corruption in more or less memory.
+
+ memory_corruption_check_period=seconds [X86]
+ By default it checks for corruption every 60
+ seconds. Use this parameter to check at some
+ other rate. 0 disables periodic checking.
+
memtest= [KNL,X86] Enable memtest
Format: <integer>
range: 0,4 : pattern number
@@ -1425,6 +1452,12 @@ and is between 256 and 4096 characters. It is defined in the file
nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
+ nox2apic [X86-64,APIC] Do not enable x2APIC mode.
+
+ x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
+ default x2apic cluster mode on platforms
+ supporting x2apic.
+
noltlbs [PPC] Do not use large page/tlb entries for kernel
lowmem mapping on PPC40x.
@@ -1882,6 +1915,12 @@ and is between 256 and 4096 characters. It is defined in the file
shapers= [NET]
Maximal number of shapers.
+ show_msr= [x86] show boot-time MSR settings
+ Format: { <integer> }
+ Show boot-time (BIOS-initialized) MSR settings.
+ The parameter means the number of CPUs to show,
+ for example 1 means boot CPU only.
+
sim710= [SCSI,HW]
See header of drivers/scsi/sim710.c.
diff --git a/Documentation/networking/LICENSE.qlge b/Documentation/networking/LICENSE.qlge
new file mode 100644
index 00000000000..123b6edd7f1
--- /dev/null
+++ b/Documentation/networking/LICENSE.qlge
@@ -0,0 +1,46 @@
+Copyright (c) 2003-2008 QLogic Corporation
+QLogic Linux Networking HBA Driver
+
+This program includes a device driver for Linux 2.6 that may be
+distributed with QLogic hardware specific firmware binary file.
+You may modify and redistribute the device driver code under the
+GNU General Public License as published by the Free Software
+Foundation (version 2 or a later version).
+
+You may redistribute the hardware specific firmware binary file
+under the following terms:
+
+ 1. Redistribution of source code (only if applicable),
+ must retain the above copyright notice, this list of
+ conditions and the following disclaimer.
+
+ 2. Redistribution in binary form must reproduce the above
+ copyright notice, this list of conditions and the
+ following disclaimer in the documentation and/or other
+ materials provided with the distribution.
+
+ 3. The name of QLogic Corporation may not be used to
+ endorse or promote products derived from this software
+ without specific prior written permission
+
+REGARDLESS OF WHAT LICENSING MECHANISM IS USED OR APPLICABLE,
+THIS PROGRAM IS PROVIDED BY QLOGIC CORPORATION "AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+USER ACKNOWLEDGES AND AGREES THAT USE OF THIS PROGRAM WILL NOT
+CREATE OR GIVE GROUNDS FOR A LICENSE BY IMPLICATION, ESTOPPEL, OR
+OTHERWISE IN ANY INTELLECTUAL PROPERTY RIGHTS (PATENT, COPYRIGHT,
+TRADE SECRET, MASK WORK, OR OTHER PROPRIETARY RIGHT) EMBODIED IN
+ANY OTHER QLOGIC HARDWARE OR SOFTWARE EITHER SOLELY OR IN
+COMBINATION WITH THIS PROGRAM.
+
diff --git a/Documentation/networking/can.txt b/Documentation/networking/can.txt
index 297ba7b1cca..2035bc4932f 100644
--- a/Documentation/networking/can.txt
+++ b/Documentation/networking/can.txt
@@ -35,8 +35,9 @@ This file contains
6.1 general settings
6.2 local loopback of sent frames
6.3 CAN controller hardware filters
- 6.4 currently supported CAN hardware
- 6.5 todo
+ 6.4 The virtual CAN driver (vcan)
+ 6.5 currently supported CAN hardware
+ 6.6 todo
7 Credits
@@ -584,7 +585,42 @@ solution for a couple of reasons:
@133MHz with four SJA1000 CAN controllers from 2002 under heavy bus
load without any problems ...
- 6.4 currently supported CAN hardware (September 2007)
+ 6.4 The virtual CAN driver (vcan)
+
+ Similar to the network loopback devices, vcan offers a virtual local
+ CAN interface. A full qualified address on CAN consists of
+
+ - a unique CAN Identifier (CAN ID)
+ - the CAN bus this CAN ID is transmitted on (e.g. can0)
+
+ so in common use cases more than one virtual CAN interface is needed.
+
+ The virtual CAN interfaces allow the transmission and reception of CAN
+ frames without real CAN controller hardware. Virtual CAN network
+ devices are usually named 'vcanX', like vcan0 vcan1 vcan2 ...
+ When compiled as a module the virtual CAN driver module is called vcan.ko
+
+ Since Linux Kernel version 2.6.24 the vcan driver supports the Kernel
+ netlink interface to create vcan network devices. The creation and
+ removal of vcan network devices can be managed with the ip(8) tool:
+
+ - Create a virtual CAN network interface:
+ ip link add type vcan
+
+ - Create a virtual CAN network interface with a specific name 'vcan42':
+ ip link add dev vcan42 type vcan
+
+ - Remove a (virtual CAN) network interface 'vcan42':
+ ip link del vcan42
+
+ The tool 'vcan' from the SocketCAN SVN repository on BerliOS is obsolete.
+
+ Virtual CAN network device creation in older Kernels:
+ In Linux Kernel versions < 2.6.24 the vcan driver creates 4 vcan
+ netdevices at module load time by default. This value can be changed
+ with the module parameter 'numdev'. E.g. 'modprobe vcan numdev=8'
+
+ 6.5 currently supported CAN hardware
On the project website http://developer.berlios.de/projects/socketcan
there are different drivers available:
@@ -603,7 +639,7 @@ solution for a couple of reasons:
Please check the Mailing Lists on the berlios OSS project website.
- 6.5 todo (September 2007)
+ 6.6 todo
The configuration interface for CAN network drivers is still an open
issue that has not been finalized in the socketcan project. Also the
diff --git a/Documentation/networking/multiqueue.txt b/Documentation/networking/multiqueue.txt
index d391ea63114..4caa0e314cc 100644
--- a/Documentation/networking/multiqueue.txt
+++ b/Documentation/networking/multiqueue.txt
@@ -24,4 +24,56 @@ netif_{start|stop|wake}_subqueue() functions to manage each queue while the
device is still operational. netdev->queue_lock is still used when the device
comes online or when it's completely shut down (unregister_netdev(), etc.).
-Author: Peter P. Waskiewicz Jr. <peter.p.waskiewicz.jr@intel.com>
+
+Section 2: Qdisc support for multiqueue devices
+
+-----------------------------------------------
+
+Currently two qdiscs are optimized for multiqueue devices. The first is the
+default pfifo_fast qdisc. This qdisc supports one qdisc per hardware queue.
+A new round-robin qdisc, sch_multiq also supports multiple hardware queues. The
+qdisc is responsible for classifying the skb's and then directing the skb's to
+bands and queues based on the value in skb->queue_mapping. Use this field in
+the base driver to determine which queue to send the skb to.
+
+sch_multiq has been added for hardware that wishes to avoid head-of-line
+blocking. It will cycle though the bands and verify that the hardware queue
+associated with the band is not stopped prior to dequeuing a packet.
+
+On qdisc load, the number of bands is based on the number of queues on the
+hardware. Once the association is made, any skb with skb->queue_mapping set,
+will be queued to the band associated with the hardware queue.
+
+
+Section 3: Brief howto using MULTIQ for multiqueue devices
+---------------------------------------------------------------
+
+The userspace command 'tc,' part of the iproute2 package, is used to configure
+qdiscs. To add the MULTIQ qdisc to your network device, assuming the device
+is called eth0, run the following command:
+
+# tc qdisc add dev eth0 root handle 1: multiq
+
+The qdisc will allocate the number of bands to equal the number of queues that
+the device reports, and bring the qdisc online. Assuming eth0 has 4 Tx
+queues, the band mapping would look like:
+
+band 0 => queue 0
+band 1 => queue 1
+band 2 => queue 2
+band 3 => queue 3
+
+Traffic will begin flowing through each queue based on either the simple_tx_hash
+function or based on netdev->select_queue() if you have it defined.
+
+The behavior of tc filters remains the same. However a new tc action,
+skbedit, has been added. Assuming you wanted to route all traffic to a
+specific host, for example 192.168.0.3, through a specific queue you could use
+this action and establish a filter such as:
+
+tc filter add dev eth0 parent 1: protocol ip prio 1 u32 \
+ match ip dst 192.168.0.3 \
+ action skbedit queue_mapping 3
+
+Author: Alexander Duyck <alexander.h.duyck@intel.com>
+Original Author: Peter P. Waskiewicz Jr. <peter.p.waskiewicz.jr@intel.com>
diff --git a/Documentation/networking/phonet.txt b/Documentation/networking/phonet.txt
new file mode 100644
index 00000000000..0e6e592f4f5
--- /dev/null
+++ b/Documentation/networking/phonet.txt
@@ -0,0 +1,175 @@
+Linux Phonet protocol family
+============================
+
+Introduction
+------------
+
+Phonet is a packet protocol used by Nokia cellular modems for both IPC
+and RPC. With the Linux Phonet socket family, Linux host processes can
+receive and send messages from/to the modem, or any other external
+device attached to the modem. The modem takes care of routing.
+
+Phonet packets can be exchanged through various hardware connections
+depending on the device, such as:
+ - USB with the CDC Phonet interface,
+ - infrared,
+ - Bluetooth,
+ - an RS232 serial port (with a dedicated "FBUS" line discipline),
+ - the SSI bus with some TI OMAP processors.
+
+
+Packets format
+--------------
+
+Phonet packets have a common header as follows:
+
+ struct phonethdr {
+ uint8_t pn_media; /* Media type (link-layer identifier) */
+ uint8_t pn_rdev; /* Receiver device ID */
+ uint8_t pn_sdev; /* Sender device ID */
+ uint8_t pn_res; /* Resource ID or function */
+ uint16_t pn_length; /* Big-endian message byte length (minus 6) */
+ uint8_t pn_robj; /* Receiver object ID */
+ uint8_t pn_sobj; /* Sender object ID */
+ };
+
+On Linux, the link-layer header includes the pn_media byte (see below).
+The next 7 bytes are part of the network-layer header.
+
+The device ID is split: the 6 higher-order bits consitute the device
+address, while the 2 lower-order bits are used for multiplexing, as are
+the 8-bit object identifiers. As such, Phonet can be considered as a
+network layer with 6 bits of address space and 10 bits for transport
+protocol (much like port numbers in IP world).
+
+The modem always has address number zero. All other device have a their
+own 6-bit address.
+
+
+Link layer
+----------
+
+Phonet links are always point-to-point links. The link layer header
+consists of a single Phonet media type byte. It uniquely identifies the
+link through which the packet is transmitted, from the modem's
+perspective. Each Phonet network device shall prepend and set the media
+type byte as appropriate. For convenience, a common phonet_header_ops
+link-layer header operations structure is provided. It sets the
+media type according to the network device hardware address.
+
+Linux Phonet network interfaces support a dedicated link layer packets
+type (ETH_P_PHONET) which is out of the Ethernet type range. They can
+only send and receive Phonet packets.
+
+The virtual TUN tunnel device driver can also be used for Phonet. This
+requires IFF_TUN mode, _without_ the IFF_NO_PI flag. In this case,
+there is no link-layer header, so there is no Phonet media type byte.
+
+Note that Phonet interfaces are not allowed to re-order packets, so
+only the (default) Linux FIFO qdisc should be used with them.
+
+
+Network layer
+-------------
+
+The Phonet socket address family maps the Phonet packet header:
+
+ struct sockaddr_pn {
+ sa_family_t spn_family; /* AF_PHONET */
+ uint8_t spn_obj; /* Object ID */
+ uint8_t spn_dev; /* Device ID */
+ uint8_t spn_resource; /* Resource or function */
+ uint8_t spn_zero[...]; /* Padding */
+ };
+
+The resource field is only used when sending and receiving;
+It is ignored by bind() and getsockname().
+
+
+Low-level datagram protocol
+---------------------------
+
+Applications can send Phonet messages using the Phonet datagram socket
+protocol from the PF_PHONET family. Each socket is bound to one of the
+2^10 object IDs available, and can send and receive packets with any
+other peer.
+
+ struct sockaddr_pn addr = { .spn_family = AF_PHONET, };
+ ssize_t len;
+ socklen_t addrlen = sizeof(addr);
+ int fd;
+
+ fd = socket(PF_PHONET, SOCK_DGRAM, 0);
+ bind(fd, (struct sockaddr *)&addr, sizeof(addr));
+ /* ... */
+
+ sendto(fd, msg, msglen, 0, (struct sockaddr *)&addr, sizeof(addr));
+ len = recvfrom(fd, buf, sizeof(buf), 0,
+ (struct sockaddr *)&addr, &addrlen);
+
+This protocol follows the SOCK_DGRAM connection-less semantics.
+However, connect() and getpeername() are not supported, as they did
+not seem useful with Phonet usages (could be added easily).
+
+
+Phonet Pipe protocol
+--------------------
+
+The Phonet Pipe protocol is a simple sequenced packets protocol
+with end-to-end congestion control. It uses the passive listening
+socket paradigm. The listening socket is bound to an unique free object
+ID. Each listening socket can handle up to 255 simultaneous
+connections, one per accept()'d socket.
+
+ int lfd, cfd;
+
+ lfd = socket(PF_PHONET, SOCK_SEQPACKET, PN_PROTO_PIPE);
+ listen (lfd, INT_MAX);
+
+ /* ... */
+ cfd = accept(lfd, NULL, NULL);
+ for (;;)
+ {
+ char buf[...];
+ ssize_t len = read(cfd, buf, sizeof(buf));
+
+ /* ... */
+
+ write(cfd, msg, msglen);
+ }
+
+Connections are established between two endpoints by a "third party"
+application. This means that both endpoints are passive; so connect()
+is not possible.
+
+WARNING:
+When polling a connected pipe socket for writability, there is an
+intrinsic race condition whereby writability might be lost between the
+polling and the writing system calls. In this case, the socket will
+block until write because possible again, unless non-blocking mode
+becomes enabled.
+
+
+The pipe protocol provides two socket options at the SOL_PNPIPE level:
+
+ PNPIPE_ENCAP accepts one integer value (int) of:
+
+ PNPIPE_ENCAP_NONE: The socket operates normally (default).
+
+ PNPIPE_ENCAP_IP: The socket is used as a backend for a virtual IP
+ interface. This requires CAP_NET_ADMIN capability. GPRS data
+ support on Nokia modems can use this. Note that the socket cannot
+ be reliably poll()'d or read() from while in this mode.
+
+ PNPIPE_IFINDEX is a read-only integer value. It contains the
+ interface index of the network interface created by PNPIPE_ENCAP,
+ or zero if encapsulation is off.
+
+
+Authors
+-------
+
+Linux Phonet was initially written by Sakari Ailus.
+Other contributors include Mikä Liljeberg, Andras Domokos,
+Carlos Chinea and Rémi Denis-Courmont.
+Copyright (C) 2008 Nokia Corporation.
diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt
new file mode 100644
index 00000000000..a96989a8ff3
--- /dev/null
+++ b/Documentation/networking/regulatory.txt
@@ -0,0 +1,194 @@
+Linux wireless regulatory documentation
+---------------------------------------
+
+This document gives a brief review over how the Linux wireless
+regulatory infrastructure works.
+
+More up to date information can be obtained at the project's web page:
+
+http://wireless.kernel.org/en/developers/Regulatory
+
+Keeping regulatory domains in userspace
+---------------------------------------
+
+Due to the dynamic nature of regulatory domains we keep them
+in userspace and provide a framework for userspace to upload
+to the kernel one regulatory domain to be used as the central
+core regulatory domain all wireless devices should adhere to.
+
+How to get regulatory domains to the kernel
+-------------------------------------------
+
+Userspace gets a regulatory domain in the kernel by having
+a userspace agent build it and send it via nl80211. Only
+expected regulatory domains will be respected by the kernel.
+
+A currently available userspace agent which can accomplish this
+is CRDA - central regulatory domain agent. Its documented here:
+
+http://wireless.kernel.org/en/developers/Regulatory/CRDA
+
+Essentially the kernel will send a udev event when it knows
+it needs a new regulatory domain. A udev rule can be put in place
+to trigger crda to send the respective regulatory domain for a
+specific ISO/IEC 3166 alpha2.
+
+Below is an example udev rule which can be used:
+
+# Example file, should be put in /etc/udev/rules.d/regulatory.rules
+KERNEL=="regulatory*", ACTION=="change", SUBSYSTEM=="platform", RUN+="/sbin/crda"
+
+The alpha2 is passed as an environment variable under the variable COUNTRY.
+
+Who asks for regulatory domains?
+--------------------------------
+
+* Users
+
+Users can use iw:
+
+http://wireless.kernel.org/en/users/Documentation/iw
+
+An example:
+
+ # set regulatory domain to "Costa Rica"
+ iw reg set CR
+
+This will request the kernel to set the regulatory domain to
+the specificied alpha2. The kernel in turn will then ask userspace
+to provide a regulatory domain for the alpha2 specified by the user
+by sending a uevent.
+
+* Wireless subsystems for Country Information elements
+
+The kernel will send a uevent to inform userspace a new
+regulatory domain is required. More on this to be added
+as its integration is added.
+
+* Drivers
+
+If drivers determine they need a specific regulatory domain
+set they can inform the wireless core using regulatory_hint().
+They have two options -- they either provide an alpha2 so that
+crda can provide back a regulatory domain for that country or
+they can build their own regulatory domain based on internal
+custom knowledge so the wireless core can respect it.
+
+*Most* drivers will rely on the first mechanism of providing a
+regulatory hint with an alpha2. For these drivers there is an additional
+check that can be used to ensure compliance based on custom EEPROM
+regulatory data. This additional check can be used by drivers by
+registering on its struct wiphy a reg_notifier() callback. This notifier
+is called when the core's regulatory domain has been changed. The driver
+can use this to review the changes made and also review who made them
+(driver, user, country IE) and determine what to allow based on its
+internal EEPROM data. Devices drivers wishing to be capable of world
+roaming should use this callback. More on world roaming will be
+added to this document when its support is enabled.
+
+Device drivers who provide their own built regulatory domain
+do not need a callback as the channels registered by them are
+the only ones that will be allowed and therefore *additional*
+cannels cannot be enabled.
+
+Example code - drivers hinting an alpha2:
+------------------------------------------
+
+This example comes from the zd1211rw device driver. You can start
+by having a mapping of your device's EEPROM country/regulatory
+domain value to to a specific alpha2 as follows:
+
+static struct zd_reg_alpha2_map reg_alpha2_map[] = {
+ { ZD_REGDOMAIN_FCC, "US" },
+ { ZD_REGDOMAIN_IC, "CA" },
+ { ZD_REGDOMAIN_ETSI, "DE" }, /* Generic ETSI, use most restrictive */
+ { ZD_REGDOMAIN_JAPAN, "JP" },
+ { ZD_REGDOMAIN_JAPAN_ADD, "JP" },
+ { ZD_REGDOMAIN_SPAIN, "ES" },
+ { ZD_REGDOMAIN_FRANCE, "FR" },
+
+Then you can define a routine to map your read EEPROM value to an alpha2,
+as follows:
+
+static int zd_reg2alpha2(u8 regdomain, char *alpha2)
+{
+ unsigned int i;
+ struct zd_reg_alpha2_map *reg_map;
+ for (i = 0; i < ARRAY_SIZE(reg_alpha2_map); i++) {
+ reg_map = &reg_alpha2_map[i];
+ if (regdomain == reg_map->reg) {
+ alpha2[0] = reg_map->alpha2[0];
+ alpha2[1] = reg_map->alpha2[1];
+ return 0;
+ }
+ }
+ return 1;
+}
+
+Lastly, you can then hint to the core of your discovered alpha2, if a match
+was found. You need to do this after you have registered your wiphy. You
+are expected to do this during initialization.
+
+ r = zd_reg2alpha2(mac->regdomain, alpha2);
+ if (!r)
+ regulatory_hint(hw->wiphy, alpha2, NULL);
+
+Example code - drivers providing a built in regulatory domain:
+--------------------------------------------------------------
+
+If you have regulatory information you can obtain from your
+driver and you *need* to use this we let you build a regulatory domain
+structure and pass it to the wireless core. To do this you should
+kmalloc() a structure big enough to hold your regulatory domain
+structure and you should then fill it with your data. Finally you simply
+call regulatory_hint() with the regulatory domain structure in it.
+
+Bellow is a simple example, with a regulatory domain cached using the stack.
+Your implementation may vary (read EEPROM cache instead, for example).
+
+Example cache of some regulatory domain
+
+struct ieee80211_regdomain mydriver_jp_regdom = {
+ .n_reg_rules = 3,
+ .alpha2 = "JP",
+ //.alpha2 = "99", /* If I have no alpha2 to map it to */
+ .reg_rules = {
+ /* IEEE 802.11b/g, channels 1..14 */
+ REG_RULE(2412-20, 2484+20, 40, 6, 20, 0),
+ /* IEEE 802.11a, channels 34..48 */
+ REG_RULE(5170-20, 5240+20, 40, 6, 20,
+ NL80211_RRF_PASSIVE_SCAN),
+ /* IEEE 802.11a, channels 52..64 */
+ REG_RULE(5260-20, 5320+20, 40, 6, 20,
+ NL80211_RRF_NO_IBSS |
+ NL80211_RRF_DFS),
+ }
+};
+
+Then in some part of your code after your wiphy has been registered:
+
+ int r;
+ struct ieee80211_regdomain *rd;
+ int size_of_regd;
+ int num_rules = mydriver_jp_regdom.n_reg_rules;
+ unsigned int i;
+
+ size_of_regd = sizeof(struct ieee80211_regdomain) +
+ (num_rules * sizeof(struct ieee80211_reg_rule));
+
+ rd = kzalloc(size_of_regd, GFP_KERNEL);
+ if (!rd)
+ return -ENOMEM;
+
+ memcpy(rd, &mydriver_jp_regdom, sizeof(struct ieee80211_regdomain));
+
+ for (i=0; i < num_rules; i++) {
+ memcpy(&rd->reg_rules[i], &mydriver_jp_regdom.reg_rules[i],
+ sizeof(struct ieee80211_reg_rule));
+ }
+ r = regulatory_hint(hw->wiphy, NULL, rd);
+ if (r) {
+ kfree(rd);
+ return r;
+ }
+
diff --git a/Documentation/networking/tproxy.txt b/Documentation/networking/tproxy.txt
new file mode 100644
index 00000000000..7b5996d9357
--- /dev/null
+++ b/Documentation/networking/tproxy.txt
@@ -0,0 +1,85 @@
+Transparent proxy support
+=========================
+
+This feature adds Linux 2.2-like transparent proxy support to current kernels.
+To use it, enable NETFILTER_TPROXY, the socket match and the TPROXY target in
+your kernel config. You will need policy routing too, so be sure to enable that
+as well.
+
+
+1. Making non-local sockets work
+================================
+
+The idea is that you identify packets with destination address matching a local
+socket on your box, set the packet mark to a certain value, and then match on that
+value using policy routing to have those packets delivered locally:
+
+# iptables -t mangle -N DIVERT
+# iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT
+# iptables -t mangle -A DIVERT -j MARK --set-mark 1
+# iptables -t mangle -A DIVERT -j ACCEPT
+
+# ip rule add fwmark 1 lookup 100
+# ip route add local 0.0.0.0/0 dev lo table 100
+
+Because of certain restrictions in the IPv4 routing output code you'll have to
+modify your application to allow it to send datagrams _from_ non-local IP
+addresses. All you have to do is enable the (SOL_IP, IP_TRANSPARENT) socket
+option before calling bind:
+
+fd = socket(AF_INET, SOCK_STREAM, 0);
+/* - 8< -*/
+int value = 1;
+setsockopt(fd, SOL_IP, IP_TRANSPARENT, &value, sizeof(value));
+/* - 8< -*/
+name.sin_family = AF_INET;
+name.sin_port = htons(0xCAFE);
+name.sin_addr.s_addr = htonl(0xDEADBEEF);
+bind(fd, &name, sizeof(name));
+
+A trivial patch for netcat is available here:
+http://people.netfilter.org/hidden/tproxy/netcat-ip_transparent-support.patch
+
+
+2. Redirecting traffic
+======================
+
+Transparent proxying often involves "intercepting" traffic on a router. This is
+usually done with the iptables REDIRECT target; however, there are serious
+limitations of that method. One of the major issues is that it actually
+modifies the packets to change the destination address -- which might not be
+acceptable in certain situations. (Think of proxying UDP for example: you won't
+be able to find out the original destination address. Even in case of TCP
+getting the original destination address is racy.)
+
+The 'TPROXY' target provides similar functionality without relying on NAT. Simply
+add rules like this to the iptables ruleset above:
+
+# iptables -t mangle -A PREROUTING -p tcp --dport 80 -j TPROXY \
+ --tproxy-mark 0x1/0x1 --on-port 50080
+
+Note that for this to work you'll have to modify the proxy to enable (SOL_IP,
+IP_TRANSPARENT) for the listening socket.
+
+
+3. Iptables extensions
+======================
+
+To use tproxy you'll need to have the 'socket' and 'TPROXY' modules
+compiled for iptables. A patched version of iptables is available
+here: http://git.balabit.hu/?p=bazsi/iptables-tproxy.git
+
+
+4. Application support
+======================
+
+4.1. Squid
+----------
+
+Squid 3.HEAD has support built-in. To use it, pass
+'--enable-linux-netfilter' to configure and set the 'tproxy' option on
+the HTTP listener you redirect traffic to with the TPROXY iptables
+target.
+
+For more information please consult the following page on the Squid
+wiki: http://wiki.squid-cache.org/Features/Tproxy4
diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt
index 6fcb3060dec..b65f0799df4 100644
--- a/Documentation/rfkill.txt
+++ b/Documentation/rfkill.txt
@@ -341,6 +341,8 @@ key that does nothing by itself, as well as any hot key that is type-specific
3.1 Guidelines for wireless device drivers
------------------------------------------
+(in this text, rfkill->foo means the foo field of struct rfkill).
+
1. Each independent transmitter in a wireless device (usually there is only one
transmitter per device) should have a SINGLE rfkill class attached to it.
@@ -363,10 +365,32 @@ This rule exists because users of the rfkill subsystem expect to get (and set,
when possible) the overall transmitter rfkill state, not of a particular rfkill
line.
-5. During suspend, the rfkill class will attempt to soft-block the radio
-through a call to rfkill->toggle_radio, and will try to restore its previous
-state during resume. After a rfkill class is suspended, it will *not* call
-rfkill->toggle_radio until it is resumed.
+5. The wireless device driver MUST NOT leave the transmitter enabled during
+suspend and hibernation unless:
+
+ 5.1. The transmitter has to be enabled for some sort of functionality
+ like wake-on-wireless-packet or autonomous packed forwarding in a mesh
+ network, and that functionality is enabled for this suspend/hibernation
+ cycle.
+
+AND
+
+ 5.2. The device was not on a user-requested BLOCKED state before
+ the suspend (i.e. the driver must NOT unblock a device, not even
+ to support wake-on-wireless-packet or remain in the mesh).
+
+In other words, there is absolutely no allowed scenario where a driver can
+automatically take action to unblock a rfkill controller (obviously, this deals
+with scenarios where soft-blocking or both soft and hard blocking is happening.
+Scenarios where hardware rfkill lines are the only ones blocking the
+transmitter are outside of this rule, since the wireless device driver does not
+control its input hardware rfkill lines in the first place).
+
+6. During resume, rfkill will try to restore its previous state.
+
+7. After a rfkill class is suspended, it will *not* call rfkill->toggle_radio
+until it is resumed.
+
Example of a WLAN wireless driver connected to the rfkill subsystem:
--------------------------------------------------------------------
diff --git a/Documentation/s390/CommonIO b/Documentation/s390/CommonIO
index bf0baa19ec2..339207d11d9 100644
--- a/Documentation/s390/CommonIO
+++ b/Documentation/s390/CommonIO
@@ -70,13 +70,19 @@ Command line parameters
Note: While already known devices can be added to the list of devices to be
ignored, there will be no effect on then. However, if such a device
- disappears and then reappears, it will then be ignored.
+ disappears and then reappears, it will then be ignored. To make
+ known devices go away, you need the "purge" command (see below).
For example,
"echo add 0.0.a000-0.0.accc, 0.0.af00-0.0.afff > /proc/cio_ignore"
will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored
devices.
+ You can remove already known but now ignored devices via
+ "echo purge > /proc/cio_ignore"
+ All devices ignored but still registered and not online (= not in use)
+ will be deregistered and thus removed from the system.
+
The devices can be specified either by bus id (0.x.abcd) or, for 2.4 backward
compatibility, by the device number in hexadecimal (0xabcd or abcd). Device
numbers given as 0xabcd will be interpreted as 0.0.abcd.
@@ -98,8 +104,7 @@ debugfs entries
handling).
- /sys/kernel/debug/s390dbf/cio_msg/sprintf
- Various debug messages from the common I/O-layer, including messages
- printed when cio_msg=yes.
+ Various debug messages from the common I/O-layer.
- /sys/kernel/debug/s390dbf/cio_trace/hex_ascii
Logs the calling of functions in the common I/O-layer and, if applicable,
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 88bcb876733..9d8eb553884 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -1,151 +1,242 @@
+ =============
+ CFS Scheduler
+ =============
-This is the CFS scheduler.
-
-80% of CFS's design can be summed up in a single sentence: CFS basically
-models an "ideal, precise multi-tasking CPU" on real hardware.
-
-"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100%
-physical power and which can run each task at precise equal speed, in
-parallel, each at 1/nr_running speed. For example: if there are 2 tasks
-running then it runs each at 50% physical power - totally in parallel.
-
-On real hardware, we can run only a single task at once, so while that
-one task runs, the other tasks that are waiting for the CPU are at a
-disadvantage - the current task gets an unfair amount of CPU time. In
-CFS this fairness imbalance is expressed and tracked via the per-task
-p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of
-time the task should now run on the CPU for it to become completely fair
-and balanced.
-
-( small detail: on 'ideal' hardware, the p->wait_runtime value would
- always be zero - no task would ever get 'out of balance' from the
- 'ideal' share of CPU time. )
-
-CFS's task picking logic is based on this p->wait_runtime value and it
-is thus very simple: it always tries to run the task with the largest
-p->wait_runtime value. In other words, CFS tries to run the task with
-the 'gravest need' for more CPU time. So CFS always tries to split up
-CPU time between runnable tasks as close to 'ideal multitasking
-hardware' as possible.
-
-Most of the rest of CFS's design just falls out of this really simple
-concept, with a few add-on embellishments like nice levels,
-multiprocessing and various algorithm variants to recognize sleepers.
-
-In practice it works like this: the system runs a task a bit, and when
-the task schedules (or a scheduler tick happens) the task's CPU usage is
-'accounted for': the (small) time it just spent using the physical CPU
-is deducted from p->wait_runtime. [minus the 'fair share' it would have
-gotten anyway]. Once p->wait_runtime gets low enough so that another
-task becomes the 'leftmost task' of the time-ordered rbtree it maintains
-(plus a small amount of 'granularity' distance relative to the leftmost
-task so that we do not over-schedule tasks and trash the cache) then the
-new leftmost task is picked and the current task is preempted.
-
-The rq->fair_clock value tracks the 'CPU time a runnable task would have
-fairly gotten, had it been runnable during that time'. So by using
-rq->fair_clock values we can accurately timestamp and measure the
-'expected CPU time' a task should have gotten. All runnable tasks are
-sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and
-CFS picks the 'leftmost' task and sticks to it. As the system progresses
-forwards, newly woken tasks are put into the tree more and more to the
-right - slowly but surely giving a chance for every task to become the
-'leftmost task' and thus get on the CPU within a deterministic amount of
-time.
-
-Some implementation details:
-
- - the introduction of Scheduling Classes: an extensible hierarchy of
- scheduler modules. These modules encapsulate scheduling policy
- details and are handled by the scheduler core without the core
- code assuming about them too much.
-
- - sched_fair.c implements the 'CFS desktop scheduler': it is a
- replacement for the vanilla scheduler's SCHED_OTHER interactivity
- code.
-
- I'd like to give credit to Con Kolivas for the general approach here:
- he has proven via RSDL/SD that 'fair scheduling' is possible and that
- it results in better desktop scheduling. Kudos Con!
-
- The CFS patch uses a completely different approach and implementation
- from RSDL/SD. My goal was to make CFS's interactivity quality exceed
- that of RSDL/SD, which is a high standard to meet :-) Testing
- feedback is welcome to decide this one way or another. [ and, in any
- case, all of SD's logic could be added via a kernel/sched_sd.c module
- as well, if Con is interested in such an approach. ]
-
- CFS's design is quite radical: it does not use runqueues, it uses a
- time-ordered rbtree to build a 'timeline' of future task execution,
- and thus has no 'array switch' artifacts (by which both the vanilla
- scheduler and RSDL/SD are affected).
-
- CFS uses nanosecond granularity accounting and does not rely on any
- jiffies or other HZ detail. Thus the CFS scheduler has no notion of
- 'timeslices' and has no heuristics whatsoever. There is only one
- central tunable (you have to switch on CONFIG_SCHED_DEBUG):
-
- /proc/sys/kernel/sched_granularity_ns
-
- which can be used to tune the scheduler from 'desktop' (low
- latencies) to 'server' (good batching) workloads. It defaults to a
- setting suitable for desktop workloads. SCHED_BATCH is handled by the
- CFS scheduler module too.
-
- Due to its design, the CFS scheduler is not prone to any of the
- 'attacks' that exist today against the heuristics of the stock
- scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all
- work fine and do not impact interactivity and produce the expected
- behavior.
-
- the CFS scheduler has a much stronger handling of nice levels and
- SCHED_BATCH: both types of workloads should be isolated much more
- agressively than under the vanilla scheduler.
-
- ( another detail: due to nanosec accounting and timeline sorting,
- sched_yield() support is very simple under CFS, and in fact under
- CFS sched_yield() behaves much better than under any other
- scheduler i have tested so far. )
-
- - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler
- way than the vanilla scheduler does. It uses 100 runqueues (for all
- 100 RT priority levels, instead of 140 in the vanilla scheduler)
- and it needs no expired array.
-
- - reworked/sanitized SMP load-balancing: the runqueue-walking
- assumptions are gone from the load-balancing code now, and
- iterators of the scheduling modules are used. The balancing code got
- quite a bit simpler as a result.
-
-
-Group scheduler extension to CFS
-================================
-
-Normally the scheduler operates on individual tasks and strives to provide
-fair CPU time to each task. Sometimes, it may be desirable to group tasks
-and provide fair CPU time to each such task group. For example, it may
-be desirable to first provide fair CPU time to each user on the system
-and then to each task belonging to a user.
-
-CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
-SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such
-groups. At present, there are two (mutually exclusive) mechanisms to group
-tasks for CPU bandwidth control purpose:
-
- - Based on user id (CONFIG_FAIR_USER_SCHED)
- In this option, tasks are grouped according to their user id.
- - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
- This options lets the administrator create arbitrary groups
- of tasks, using the "cgroup" pseudo filesystem. See
- Documentation/cgroups.txt for more information about this
- filesystem.
-Only one of these options to group tasks can be chosen and not both.
+1. OVERVIEW
+
+CFS stands for "Completely Fair Scheduler," and is the new "desktop" process
+scheduler implemented by Ingo Molnar and merged in Linux 2.6.23. It is the
+replacement for the previous vanilla scheduler's SCHED_OTHER interactivity
+code.
+
+80% of CFS's design can be summed up in a single sentence: CFS basically models
+an "ideal, precise multi-tasking CPU" on real hardware.
+
+"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% physical
+power and which can run each task at precise equal speed, in parallel, each at
+1/nr_running speed. For example: if there are 2 tasks running, then it runs
+each at 50% physical power --- i.e., actually in parallel.
+
+On real hardware, we can run only a single task at once, so we have to
+introduce the concept of "virtual runtime." The virtual runtime of a task
+specifies when its next timeslice would start execution on the ideal
+multi-tasking CPU described above. In practice, the virtual runtime of a task
+is its actual runtime normalized to the total number of running tasks.
+
+
+
+2. FEW IMPLEMENTATION DETAILS
+
+In CFS the virtual runtime is expressed and tracked via the per-task
+p->se.vruntime (nanosec-unit) value. This way, it's possible to accurately
+timestamp and measure the "expected CPU time" a task should have gotten.
+
+[ small detail: on "ideal" hardware, at any time all tasks would have the same
+ p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
+ would ever get "out of balance" from the "ideal" share of CPU time. ]
+
+CFS's task picking logic is based on this p->se.vruntime value and it is thus
+very simple: it always tries to run the task with the smallest p->se.vruntime
+value (i.e., the task which executed least so far). CFS always tries to split
+up CPU time between runnable tasks as close to "ideal multitasking hardware" as
+possible.
+
+Most of the rest of CFS's design just falls out of this really simple concept,
+with a few add-on embellishments like nice levels, multiprocessing and various
+algorithm variants to recognize sleepers.
+
+
+
+3. THE RBTREE
+
+CFS's design is quite radical: it does not use the old data structures for the
+runqueues, but it uses a time-ordered rbtree to build a "timeline" of future
+task execution, and thus has no "array switch" artifacts (by which both the
+previous vanilla scheduler and RSDL/SD are affected).
+
+CFS also maintains the rq->cfs.min_vruntime value, which is a monotonic
+increasing value tracking the smallest vruntime among all tasks in the
+runqueue. The total amount of work done by the system is tracked using
+min_vruntime; that value is used to place newly activated entities on the left
+side of the tree as much as possible.
+
+The total number of running tasks in the runqueue is accounted through the
+rq->cfs.load value, which is the sum of the weights of the tasks queued on the
+runqueue.
+
+CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the
+p->se.vruntime key (there is a subtraction using rq->cfs.min_vruntime to
+account for possible wraparounds). CFS picks the "leftmost" task from this
+tree and sticks to it.
+As the system progresses forwards, the executed tasks are put into the tree
+more and more to the right --- slowly but surely giving a chance for every task
+to become the "leftmost task" and thus get on the CPU within a deterministic
+amount of time.
+
+Summing up, CFS works like this: it runs a task a bit, and when the task
+schedules (or a scheduler tick happens) the task's CPU usage is "accounted
+for": the (small) time it just spent using the physical CPU is added to
+p->se.vruntime. Once p->se.vruntime gets high enough so that another task
+becomes the "leftmost task" of the time-ordered rbtree it maintains (plus a
+small amount of "granularity" distance relative to the leftmost task so that we
+do not over-schedule tasks and trash the cache), then the new leftmost task is
+picked and the current task is preempted.
+
+
+
+4. SOME FEATURES OF CFS
+
+CFS uses nanosecond granularity accounting and does not rely on any jiffies or
+other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
+way the previous scheduler had, and has no heuristics whatsoever. There is
+only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
+
+ /proc/sys/kernel/sched_granularity_ns
+
+which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
+"server" (i.e., good batching) workloads. It defaults to a setting suitable
+for desktop workloads. SCHED_BATCH is handled by the CFS scheduler module too.
+
+Due to its design, the CFS scheduler is not prone to any of the "attacks" that
+exist today against the heuristics of the stock scheduler: fiftyp.c, thud.c,
+chew.c, ring-test.c, massive_intr.c all work fine and do not impact
+interactivity and produce the expected behavior.
+
+The CFS scheduler has a much stronger handling of nice levels and SCHED_BATCH
+than the previous vanilla scheduler: both types of workloads are isolated much
+more aggressively.
+
+SMP load-balancing has been reworked/sanitized: the runqueue-walking
+assumptions are gone from the load-balancing code now, and iterators of the
+scheduling modules are used. The balancing code got quite a bit simpler as a
+result.
+
+
+
+5. Scheduling policies
+
+CFS implements three scheduling policies:
+
+ - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling
+ policy that is used for regular tasks.
+
+ - SCHED_BATCH: Does not preempt nearly as often as regular tasks
+ would, thereby allowing tasks to run longer and make better use of
+ caches but at the cost of interactivity. This is well suited for
+ batch jobs.
+
+ - SCHED_IDLE: This is even weaker than nice 19, but its not a true
+ idle timer scheduler in order to avoid to get into priority
+ inversion problems which would deadlock the machine.
+
+SCHED_FIFO/_RR are implemented in sched_rt.c and are as specified by
+POSIX.
+
+The command chrt from util-linux-ng 2.13.1.1 can set all of these except
+SCHED_IDLE.
-Group scheduler tunables:
-When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
-each new user and a "cpu_share" file is added in that directory.
+
+6. SCHEDULING CLASSES
+
+The new CFS scheduler has been designed in such a way to introduce "Scheduling
+Classes," an extensible hierarchy of scheduler modules. These modules
+encapsulate scheduling policy details and are handled by the scheduler core
+without the core code assuming too much about them.
+
+sched_fair.c implements the CFS scheduler described above.
+
+sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than
+the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT
+priority levels, instead of 140 in the previous scheduler) and it needs no
+expired array.
+
+Scheduling classes are implemented through the sched_class structure, which
+contains hooks to functions that must be called whenever an interesting event
+occurs.
+
+This is the (partial) list of the hooks:
+
+ - enqueue_task(...)
+
+ Called when a task enters a runnable state.
+ It puts the scheduling entity (task) into the red-black tree and
+ increments the nr_running variable.
+
+ - dequeue_tree(...)
+
+ When a task is no longer runnable, this function is called to keep the
+ corresponding scheduling entity out of the red-black tree. It decrements
+ the nr_running variable.
+
+ - yield_task(...)
+
+ This function is basically just a dequeue followed by an enqueue, unless the
+ compat_yield sysctl is turned on; in that case, it places the scheduling
+ entity at the right-most end of the red-black tree.
+
+ - check_preempt_curr(...)
+
+ This function checks if a task that entered the runnable state should
+ preempt the currently running task.
+
+ - pick_next_task(...)
+
+ This function chooses the most appropriate task eligible to run next.
+
+ - set_curr_task(...)
+
+ This function is called when a task changes its scheduling class or changes
+ its task group.
+
+ - task_tick(...)
+
+ This function is mostly called from time tick functions; it might lead to
+ process switch. This drives the running preemption.
+
+ - task_new(...)
+
+ The core scheduler gives the scheduling module an opportunity to manage new
+ task startup. The CFS scheduling module uses it for group scheduling, while
+ the scheduling module for a real-time task does not use it.
+
+
+
+7. GROUP SCHEDULER EXTENSIONS TO CFS
+
+Normally, the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks and
+provide fair CPU time to each such task group. For example, it may be
+desirable to first provide fair CPU time to each user on the system and then to
+each task belonging to a user.
+
+CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be
+grouped and divides CPU time fairly among such groups.
+
+CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
+SCHED_RR) tasks.
+
+CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
+SCHED_BATCH) tasks.
+
+At present, there are two (mutually exclusive) mechanisms to group tasks for
+CPU bandwidth control purposes:
+
+ - Based on user id (CONFIG_USER_SCHED)
+
+ With this option, tasks are grouped according to their user id.
+
+ - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
+
+ This options needs CONFIG_CGROUPS to be defined, and lets the administrator
+ create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
+ Documentation/cgroups.txt for more information about this filesystem.
+
+Only one of these options to group tasks can be chosen and not both.
+
+When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
+user and a "cpu_share" file is added in that directory.
# cd /sys/kernel/uids
# cat 512/cpu_share # Display user 512's CPU share
@@ -155,16 +246,14 @@ each new user and a "cpu_share" file is added in that directory.
2048
#
-CPU bandwidth between two users are divided in the ratio of their CPU shares.
-For ex: if you would like user "root" to get twice the bandwidth of user
-"guest", then set the cpu_share for both the users such that "root"'s
-cpu_share is twice "guest"'s cpu_share
-
+CPU bandwidth between two users is divided in the ratio of their CPU shares.
+For example: if you would like user "root" to get twice the bandwidth of user
+"guest," then set the cpu_share for both the users such that "root"'s cpu_share
+is twice "guest"'s cpu_share.
-When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
-for each group created using the pseudo filesystem. See example steps
-below to create task groups and modify their CPU share using the "cgroups"
-pseudo filesystem
+When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
+group created using the pseudo filesystem. See example steps below to create
+task groups and modify their CPU share using the "cgroups" pseudo filesystem.
# mkdir /dev/cpuctl
# mount -t cgroup -ocpu none /dev/cpuctl
diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt
index 75143f0c23b..38d324d62b2 100644
--- a/Documentation/scsi/scsi_fc_transport.txt
+++ b/Documentation/scsi/scsi_fc_transport.txt
@@ -436,6 +436,42 @@ Other:
was updated to remove all vports for the fc_host as well.
+Transport supplied functions
+----------------------------
+
+The following functions are supplied by the FC-transport for use by LLDs.
+
+ fc_vport_create - create a vport
+ fc_vport_terminate - detach and remove a vport
+
+Details:
+
+/**
+ * fc_vport_create - Admin App or LLDD requests creation of a vport
+ * @shost: scsi host the virtual port is connected to.
+ * @ids: The world wide names, FC4 port roles, etc for
+ * the virtual port.
+ *
+ * Notes:
+ * This routine assumes no locks are held on entry.
+ */
+struct fc_vport *
+fc_vport_create(struct Scsi_Host *shost, struct fc_vport_identifiers *ids)
+
+/**
+ * fc_vport_terminate - Admin App or LLDD requests termination of a vport
+ * @vport: fc_vport to be terminated
+ *
+ * Calls the LLDD vport_delete() function, then deallocates and removes
+ * the vport from the shost and object tree.
+ *
+ * Notes:
+ * This routine assumes no locks are held on entry.
+ */
+int
+fc_vport_terminate(struct fc_vport *vport)
+
+
Credits
=======
The following people have contributed to this document:
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index b117e42a616..e0e54a27fc1 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -746,8 +746,10 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
Module snd-hda-intel
--------------------
- Module for Intel HD Audio (ICH6, ICH6M, ESB2, ICH7, ICH8),
- ATI SB450, SB600, RS600,
+ Module for Intel HD Audio (ICH6, ICH6M, ESB2, ICH7, ICH8, ICH9, ICH10,
+ PCH, SCH),
+ ATI SB450, SB600, R600, RS600, RS690, RS780, RV610, RV620,
+ RV630, RV635, RV670, RV770,
VIA VT8251/VT8237A,
SIS966, ULI M5461
@@ -807,6 +809,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
ALC260
hp HP machines
hp-3013 HP machines (3013-variant)
+ hp-dc7600 HP DC7600
fujitsu Fujitsu S7020
acer Acer TravelMate
will Will laptops (PB V7900)
@@ -828,8 +831,11 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
hippo Hippo (ATI) with jack detection, Sony UX-90s
hippo_1 Hippo (Benq) with jack detection
sony-assamd Sony ASSAMD
+ toshiba-s06 Toshiba S06
+ toshiba-rx1 Toshiba RX1
ultra Samsung Q1 Ultra Vista model
lenovo-3000 Lenovo 3000 y410
+ nec NEC Versa S9100
basic fixed pin assignment w/o SPDIF
auto auto-config reading BIOS (default)
@@ -838,6 +844,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
3stack 3-stack model
toshiba Toshiba A205
acer Acer laptops
+ acer-aspire Acer Aspire One
dell Dell OEM laptops (Vostro 1200)
zepto Zepto laptops
test for testing/debugging purpose, almost all controls can
@@ -847,6 +854,9 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
ALC269
basic Basic preset
+ quanta Quanta FL1
+ eeepc-p703 ASUS Eeepc P703 P900A
+ eeepc-p901 ASUS Eeepc P901 S101
ALC662/663
3stack-dig 3-stack (2-channel) with SPDIF
@@ -856,10 +866,17 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
lenovo-101e Lenovo laptop
eeepc-p701 ASUS Eeepc P701
eeepc-ep20 ASUS Eeepc EP20
+ ecs ECS/Foxconn mobo
m51va ASUS M51VA
g71v ASUS G71V
h13 ASUS H13
g50v ASUS G50V
+ asus-mode1 ASUS
+ asus-mode2 ASUS
+ asus-mode3 ASUS
+ asus-mode4 ASUS
+ asus-mode5 ASUS
+ asus-mode6 ASUS
auto auto-config reading BIOS (default)
ALC882/885
@@ -891,12 +908,14 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
lenovo-101e Lenovo 101E
lenovo-nb0763 Lenovo NB0763
lenovo-ms7195-dig Lenovo MS7195
+ lenovo-sky Lenovo Sky
haier-w66 Haier W66
3stack-hp HP machines with 3stack (Lucknow, Samba boards)
6stack-dell Dell machines with 6stack (Inspiron 530)
mitac Mitac 8252D
clevo-m720 Clevo M720 laptop series
fujitsu-pi2515 Fujitsu AMILO Pi2515
+ 3stack-6ch-intel Intel DG33* boards
auto auto-config reading BIOS (default)
ALC861/660
@@ -929,7 +948,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
allout 5-jack in back, 2-jack in front, SPDIF out
auto auto-config reading BIOS (default)
- AD1882
+ AD1882 / AD1882A
3stack 3-stack mode (default)
6stack 6-stack mode
@@ -1079,7 +1098,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
register value without FIFO size correction as the current
DMA pointer. position_fix=2 will make the driver to use
the position buffer instead of reading SD_LPIB register.
- (Usually SD_LPLIB register is more accurate than the
+ (Usually SD_LPIB register is more accurate than the
position buffer.)
NB: If you get many "azx_get_response timeout" messages at
@@ -1166,6 +1185,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
* Event Electronics, EZ8
* Digigram VX442
* Lionstracs, Mediastaton
+ * Terrasoniq TS 88
model - Use the given board model, one of the following:
delta1010, dio2496, delta66, delta44, audiophile, delta410,
@@ -1200,7 +1220,10 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
* TerraTec Phase 22
* TerraTec Phase 28
* AudioTrak Prodigy 7.1
- * AudioTrak Prodigy 7.1LT
+ * AudioTrak Prodigy 7.1 LT
+ * AudioTrak Prodigy 7.1 XT
+ * AudioTrak Prodigy 7.1 HIFI
+ * AudioTrak Prodigy 7.1 HD2
* AudioTrak Prodigy 192
* Pontis MS300
* Albatron K8X800 Pro II
@@ -1211,12 +1234,16 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
* Shuttle SN25P
* Onkyo SE-90PCI
* Onkyo SE-200PCI
+ * ESI Juli@
+ * Hercules Fortissimo IV
+ * EGO-SYS WaveTerminal 192M
model - Use the given board model, one of the following:
revo51, revo71, amp2000, prodigy71, prodigy71lt,
- prodigy192, aureon51, aureon71, universe, ap192,
- k8x800, phase22, phase28, ms300, av710, se200pci,
- se90pci
+ prodigy71xt, prodigy71hifi, prodigyhd2, prodigy192,
+ juli, aureon51, aureon71, universe, ap192, k8x800,
+ phase22, phase28, ms300, av710, se200pci, se90pci,
+ fortissimo4, sn25p, WT192M
This module supports multiple cards and autoprobe.
@@ -1255,7 +1282,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
Module for AC'97 motherboards from Intel and compatibles.
* Intel i810/810E, i815, i820, i830, i84x, MX440
- ICH5, ICH6, ICH7, ESB2
+ ICH5, ICH6, ICH7, 6300ESB, ESB2
* SiS 7012 (SiS 735)
* NVidia NForce, NForce2, NForce3, MCP04, CK804
CK8, CK8S, MCP501
@@ -1951,6 +1978,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
* CHIC True Sound 4Dwave
* Shark Predator4D-PCI
* Jaton SonicWave 4D
+ * SiS SI7018 PCI Audio
+ * Hoontech SoundTrack Digital 4DWave NX
pcm_channels - max channels (voices) reserved for PCM
wavetable_size - max wavetable size in kB (4-?kb)
@@ -1966,12 +1995,25 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
vid - Vendor ID for the device (optional)
pid - Product ID for the device (optional)
+ nrpacks - Max. number of packets per URB (default: 8)
+ async_unlink - Use async unlink mode (default: yes)
device_setup - Device specific magic number (optional)
- Influence depends on the device
- Default: 0x0000
+ ignore_ctl_error - Ignore any USB-controller regarding mixer
+ interface (default: no)
This module supports multiple devices, autoprobe and hotplugging.
+ NB: nrpacks parameter can be modified dynamically via sysfs.
+ Don't put the value over 20. Changing via sysfs has no sanity
+ check.
+ NB: async_unlink=0 would cause Oops. It remains just for
+ debugging purpose (if any).
+ NB: ignore_ctl_error=1 may help when you get an error at accessing
+ the mixer element such as URB error -22. This happens on some
+ buggy USB device or the controller.
+
Module snd-usb-caiaq
--------------------
@@ -2078,7 +2120,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
-------------------
Module for sound cards based on the Asus AV100/AV200 chips,
- i.e., Xonar D1, DX, D2 and D2X.
+ i.e., Xonar D1, DX, D2, D2X and HDAV1.3 (Deluxe).
This module supports autoprobe and multiple cards.
diff --git a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
index e13c4e67029..87a7c07ab65 100644
--- a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
@@ -5073,8 +5073,7 @@ struct _snd_pcm_runtime {
with <constant>SNDRV_DMA_TYPE_CONTINUOUS</constant> type and the
<function>snd_dma_continuous_data(GFP_KERNEL)</function> device pointer,
where <constant>GFP_KERNEL</constant> is the kernel allocation flag to
- use. For the SBUS, <constant>SNDRV_DMA_TYPE_SBUS</constant> and
- <function>snd_dma_sbus_data(sbus_dev)</function> are used instead.
+ use.
For the PCI scatter-gather buffers, use
<constant>SNDRV_DMA_TYPE_DEV_SG</constant> with
<function>snd_dma_pci_data(pci)</function>
@@ -6135,44 +6134,58 @@ struct _snd_pcm_runtime {
</para>
</section>
- <section id="useful-functions-snd-assert">
- <title><function>snd_assert()</function></title>
+ <section id="useful-functions-snd-bug">
+ <title><function>snd_BUG()</function></title>
<para>
- <function>snd_assert()</function> macro is similar with the
- normal <function>assert()</function> macro. For example,
+ It shows the <computeroutput>BUG?</computeroutput> message and
+ stack trace as well as <function>snd_BUG_ON</function> at the point.
+ It's useful to show that a fatal error happens there.
+ </para>
+ <para>
+ When no debug flag is set, this macro is ignored.
+ </para>
+ </section>
+
+ <section id="useful-functions-snd-bug-on">
+ <title><function>snd_BUG_ON()</function></title>
+ <para>
+ <function>snd_BUG_ON()</function> macro is similar with
+ <function>WARN_ON()</function> macro. For example,
<informalexample>
<programlisting>
<![CDATA[
- snd_assert(pointer != NULL, return -EINVAL);
+ snd_BUG_ON(!pointer);
]]>
</programlisting>
</informalexample>
- </para>
- <para>
- The first argument is the expression to evaluate, and the
- second argument is the action if it fails. When
- <constant>CONFIG_SND_DEBUG</constant>, is set, it will show an
- error message such as <computeroutput>BUG? (xxx)</computeroutput>
- together with stack trace.
- </para>
- <para>
- When no debug flag is set, this macro is ignored.
- </para>
- </section>
+ or it can be used as the condition,
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ if (snd_BUG_ON(non_zero_is_bug))
+ return -EINVAL;
+]]>
+ </programlisting>
+ </informalexample>
- <section id="useful-functions-snd-bug">
- <title><function>snd_BUG()</function></title>
- <para>
- It shows the <computeroutput>BUG?</computeroutput> message and
- stack trace as well as <function>snd_assert</function> at the point.
- It's useful to show that a fatal error happens there.
</para>
+
<para>
- When no debug flag is set, this macro is ignored.
+ The macro takes an conditional expression to evaluate.
+ When <constant>CONFIG_SND_DEBUG</constant>, is set, the
+ expression is actually evaluated. If it's non-zero, it shows
+ the warning message such as
+ <computeroutput>BUG? (xxx)</computeroutput>
+ normally followed by stack trace. It returns the evaluated
+ value.
+ When no <constant>CONFIG_SND_DEBUG</constant> is set, this
+ macro always returns zero.
</para>
+
</section>
+
</chapter>
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt
index b2ed6983f40..46f9684d0b2 100644
--- a/Documentation/sound/alsa/soc/dapm.txt
+++ b/Documentation/sound/alsa/soc/dapm.txt
@@ -135,11 +135,7 @@ when the Mic is inserted:-
static int spitz_mic_bias(struct snd_soc_dapm_widget* w, int event)
{
- if(SND_SOC_DAPM_EVENT_ON(event))
- set_scoop_gpio(&spitzscoop2_device.dev, SPITZ_SCP2_MIC_BIAS);
- else
- reset_scoop_gpio(&spitzscoop2_device.dev, SPITZ_SCP2_MIC_BIAS);
-
+ gpio_set_value(SPITZ_GPIO_MIC_BIAS, SND_SOC_DAPM_EVENT_ON(event));
return 0;
}
@@ -269,11 +265,7 @@ powered only when the spk is in use.
/* turn speaker amplifier on/off depending on use */
static int corgi_amp_event(struct snd_soc_dapm_widget *w, int event)
{
- if (SND_SOC_DAPM_EVENT_ON(event))
- set_scoop_gpio(&corgiscoop_device.dev, CORGI_SCP_APM_ON);
- else
- reset_scoop_gpio(&corgiscoop_device.dev, CORGI_SCP_APM_ON);
-
+ gpio_set_value(CORGI_GPIO_APM_ON, SND_SOC_DAPM_EVENT_ON(event));
return 0;
}
diff --git a/Documentation/sparc/sbus_drivers.txt b/Documentation/sparc/sbus_drivers.txt
deleted file mode 100644
index eb1e28ad882..00000000000
--- a/Documentation/sparc/sbus_drivers.txt
+++ /dev/null
@@ -1,309 +0,0 @@
-
- Writing SBUS Drivers
-
- David S. Miller (davem@redhat.com)
-
- The SBUS driver interfaces of the Linux kernel have been
-revamped completely for 2.4.x for several reasons. Foremost were
-performance and complexity concerns. This document details these
-new interfaces and how they are used to write an SBUS device driver.
-
- SBUS drivers need to include <asm/sbus.h> to get access
-to functions and structures described here.
-
- Probing and Detection
-
- Each SBUS device inside the machine is described by a
-structure called "struct sbus_dev". Likewise, each SBUS bus
-found in the system is described by a "struct sbus_bus". For
-each SBUS bus, the devices underneath are hung in a tree-like
-fashion off of the bus structure.
-
- The SBUS device structure contains enough information
-for you to implement your device probing algorithm and obtain
-the bits necessary to run your device. The most commonly
-used members of this structure, and their typical usage,
-will be detailed below.
-
- Here is a piece of skeleton code for performing a device
-probe in an SBUS driver under Linux:
-
- static int __devinit mydevice_probe_one(struct sbus_dev *sdev)
- {
- struct mysdevice *mp = kzalloc(sizeof(*mp), GFP_KERNEL);
-
- if (!mp)
- return -ENODEV;
-
- ...
- dev_set_drvdata(&sdev->ofdev.dev, mp);
- return 0;
- ...
- }
-
- static int __devinit mydevice_probe(struct of_device *dev,
- const struct of_device_id *match)
- {
- struct sbus_dev *sdev = to_sbus_device(&dev->dev);
-
- return mydevice_probe_one(sdev);
- }
-
- static int __devexit mydevice_remove(struct of_device *dev)
- {
- struct sbus_dev *sdev = to_sbus_device(&dev->dev);
- struct mydevice *mp = dev_get_drvdata(&dev->dev);
-
- return mydevice_remove_one(sdev, mp);
- }
-
- static struct of_device_id mydevice_match[] = {
- {
- .name = "mydevice",
- },
- {},
- };
-
- MODULE_DEVICE_TABLE(of, mydevice_match);
-
- static struct of_platform_driver mydevice_driver = {
- .match_table = mydevice_match,
- .probe = mydevice_probe,
- .remove = __devexit_p(mydevice_remove),
- .driver = {
- .name = "mydevice",
- },
- };
-
- static int __init mydevice_init(void)
- {
- return of_register_driver(&mydevice_driver, &sbus_bus_type);
- }
-
- static void __exit mydevice_exit(void)
- {
- of_unregister_driver(&mydevice_driver);
- }
-
- module_init(mydevice_init);
- module_exit(mydevice_exit);
-
- The mydevice_match table is a series of entries which
-describes what SBUS devices your driver is meant for. In the
-simplest case you specify a string for the 'name' field. Every
-SBUS device with a 'name' property matching your string will
-be passed one-by-one to your .probe method.
-
- You should store away your device private state structure
-pointer in the drvdata area so that you can retrieve it later on
-in your .remove method.
-
- Any memory allocated, registers mapped, IRQs registered,
-etc. must be undone by your .remove method so that all resources
-of your device are released by the time it returns.
-
- You should _NOT_ use the for_each_sbus(), for_each_sbusdev(),
-and for_all_sbusdev() interfaces. They are deprecated, will be
-removed, and no new driver should reference them ever.
-
- Mapping and Accessing I/O Registers
-
- Each SBUS device structure contains an array of descriptors
-which describe each register set. We abuse struct resource for that.
-They each correspond to the "reg" properties provided by the OBP firmware.
-
- Before you can access your device's registers you must map
-them. And later if you wish to shutdown your driver (for module
-unload or similar) you must unmap them. You must treat them as
-a resource, which you allocate (map) before using and free up
-(unmap) when you are done with it.
-
- The mapping information is stored in an opaque value
-typed as an "unsigned long". This is the type of the return value
-of the mapping interface, and the arguments to the unmapping
-interface. Let's say you want to map the first set of registers.
-Perhaps part of your driver software state structure looks like:
-
- struct mydevice {
- unsigned long control_regs;
- ...
- struct sbus_dev *sdev;
- ...
- };
-
- At initialization time you then use the sbus_ioremap
-interface to map in your registers, like so:
-
- static void init_one_mydevice(struct sbus_dev *sdev)
- {
- struct mydevice *mp;
- ...
-
- mp->control_regs = sbus_ioremap(&sdev->resource[0], 0,
- CONTROL_REGS_SIZE, "mydevice regs");
- if (!mp->control_regs) {
- /* Failure, cleanup and return. */
- }
- }
-
- Second argument to sbus_ioremap is an offset for
-cranky devices with broken OBP PROM. The sbus_ioremap uses only
-a start address and flags from the resource structure.
-Therefore it is possible to use the same resource to map
-several sets of registers or even to fabricate a resource
-structure if driver gets physical address from some private place.
-This practice is discouraged though. Use whatever OBP PROM
-provided to you.
-
- And here is how you might unmap these registers later at
-driver shutdown or module unload time, using the sbus_iounmap
-interface:
-
- static void mydevice_unmap_regs(struct mydevice *mp)
- {
- sbus_iounmap(mp->control_regs, CONTROL_REGS_SIZE);
- }
-
- Finally, to actually access your registers there are 6
-interface routines at your disposal. Accesses are byte (8 bit),
-word (16 bit), or longword (32 bit) sized. Here they are:
-
- u8 sbus_readb(unsigned long reg) /* read byte */
- u16 sbus_readw(unsigned long reg) /* read word */
- u32 sbus_readl(unsigned long reg) /* read longword */
- void sbus_writeb(u8 value, unsigned long reg) /* write byte */
- void sbus_writew(u16 value, unsigned long reg) /* write word */
- void sbus_writel(u32 value, unsigned long reg) /* write longword */
-
- So, let's say your device has a control register of some sort
-at offset zero. The following might implement resetting your device:
-
- #define CONTROL 0x00UL
-
- #define CONTROL_RESET 0x00000001 /* Reset hardware */
-
- static void mydevice_reset(struct mydevice *mp)
- {
- sbus_writel(CONTROL_RESET, mp->regs + CONTROL);
- }
-
- Or perhaps there is a data port register at an offset of
-16 bytes which allows you to read bytes from a fifo in the device:
-
- #define DATA 0x10UL
-
- static u8 mydevice_get_byte(struct mydevice *mp)
- {
- return sbus_readb(mp->regs + DATA);
- }
-
- It's pretty straightforward, and clueful readers may have
-noticed that these interfaces mimick the PCI interfaces of the
-Linux kernel. This was not by accident.
-
- WARNING:
-
- DO NOT try to treat these opaque register mapping
- values as a memory mapped pointer to some structure
- which you can dereference.
-
- It may be memory mapped, it may not be. In fact it
- could be a physical address, or it could be the time
- of day xor'd with 0xdeadbeef. :-)
-
- Whatever it is, it's an implementation detail. The
- interface was done this way to shield the driver
- author from such complexities.
-
- Doing DVMA
-
- SBUS devices can perform DMA transactions in a way similar
-to PCI but dissimilar to ISA, e.g. DMA masters supply address.
-In contrast to PCI, however, that address (a bus address) is
-translated by IOMMU before a memory access is performed and therefore
-it is virtual. Sun calls this procedure DVMA.
-
- Linux supports two styles of using SBUS DVMA: "consistent memory"
-and "streaming DVMA". CPU view of consistent memory chunk is, well,
-consistent with a view of a device. Think of it as an uncached memory.
-Typically this way of doing DVMA is not very fast and drivers use it
-mostly for control blocks or queues. On some CPUs we cannot flush or
-invalidate individual pages or cache lines and doing explicit flushing
-over ever little byte in every control block would be wasteful.
-
-Streaming DVMA is a preferred way to transfer large amounts of data.
-This process works in the following way:
-1. a CPU stops accessing a certain part of memory,
- flushes its caches covering that memory;
-2. a device does DVMA accesses, then posts an interrupt;
-3. CPU invalidates its caches and starts to access the memory.
-
-A single streaming DVMA operation can touch several discontiguous
-regions of a virtual bus address space. This is called a scatter-gather
-DVMA.
-
-[TBD: Why do not we neither Solaris attempt to map disjoint pages
-into a single virtual chunk with the help of IOMMU, so that non SG
-DVMA masters would do SG? It'd be very helpful for RAID.]
-
- In order to perform a consistent DVMA a driver does something
-like the following:
-
- char *mem; /* Address in the CPU space */
- u32 busa; /* Address in the SBus space */
-
- mem = (char *) sbus_alloc_consistent(sdev, MYMEMSIZE, &busa);
-
- Then mem is used when CPU accesses this memory and u32
-is fed to the device so that it can do DVMA. This is typically
-done with an sbus_writel() into some device register.
-
- Do not forget to free the DVMA resources once you are done:
-
- sbus_free_consistent(sdev, MYMEMSIZE, mem, busa);
-
- Streaming DVMA is more interesting. First you allocate some
-memory suitable for it or pin down some user pages. Then it all works
-like this:
-
- char *mem = argumen1;
- unsigned int size = argument2;
- u32 busa; /* Address in the SBus space */
-
- *mem = 1; /* CPU can access */
- busa = sbus_map_single(sdev, mem, size);
- if (busa == 0) .......
-
- /* Tell the device to use busa here */
- /* CPU cannot access the memory without sbus_dma_sync_single() */
-
- sbus_unmap_single(sdev, busa, size);
- if (*mem == 0) .... /* CPU can access again */
-
- It is possible to retain mappings and ask the device to
-access data again and again without calling sbus_unmap_single.
-However, CPU caches must be invalidated with sbus_dma_sync_single
-before such access.
-
-[TBD but what about writeback caches here... do we have any?]
-
- There is an equivalent set of functions doing the same thing
-only with several memory segments at once for devices capable of
-scatter-gather transfers. Use the Source, Luke.
-
- Examples
-
- drivers/net/sunhme.c
- This is a complicated driver which illustrates many concepts
-discussed above and plus it handles both PCI and SBUS boards.
-
- drivers/scsi/esp.c
- Check it out for scatter-gather DVMA.
-
- drivers/sbus/char/bpp.c
- A non-DVMA device.
-
- drivers/net/sunlance.c
- Lance driver abuses consistent mappings for data transfer.
-It is a nifty trick which we do not particularly recommend...
-Just check it out and know that it's legal.
diff --git a/Documentation/x86/00-INDEX b/Documentation/x86/00-INDEX
new file mode 100644
index 00000000000..dbe3377754a
--- /dev/null
+++ b/Documentation/x86/00-INDEX
@@ -0,0 +1,4 @@
+00-INDEX
+ - this file
+mtrr.txt
+ - how to use x86 Memory Type Range Registers to increase performance
diff --git a/Documentation/x86/i386/boot.txt b/Documentation/x86/boot.txt
index 147bfe511cd..83c0033ee9e 100644
--- a/Documentation/x86/i386/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -308,7 +308,7 @@ Protocol: 2.00+
Field name: start_sys
Type: read
-Offset/size: 0x20c/4
+Offset/size: 0x20c/2
Protocol: 2.00+
The load low segment (0x1000). Obsolete.
diff --git a/Documentation/mtrr.txt b/Documentation/x86/mtrr.txt
index c39ac395970..cc071dc333c 100644
--- a/Documentation/mtrr.txt
+++ b/Documentation/x86/mtrr.txt
@@ -18,7 +18,7 @@ Richard Gooch
The AMD K6-2 (stepping 8 and above) and K6-3 processors have two
MTRRs. These are supported. The AMD Athlon family provide 8 Intel
style MTRRs.
-
+
The Centaur C6 (WinChip) has 8 MCRs, allowing write-combining. These
are supported.
@@ -87,7 +87,7 @@ reg00: base=0x00000000 ( 0MB), size= 64MB: write-back, count=1
reg01: base=0xfb000000 (4016MB), size= 16MB: write-combining, count=1
reg02: base=0xfb000000 (4016MB), size= 4kB: uncachable, count=1
-Some cards (especially Voodoo Graphics boards) need this 4 kB area
+Some cards (especially Voodoo Graphics boards) need this 4 kB area
excluded from the beginning of the region because it is used for
registers.
diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt
index 17965f927c1..c93ff5f4c0d 100644
--- a/Documentation/x86/pat.txt
+++ b/Documentation/x86/pat.txt
@@ -14,6 +14,10 @@ PAT allows for different types of memory attributes. The most commonly used
ones that will be supported at this time are Write-back, Uncached,
Write-combined and Uncached Minus.
+
+PAT APIs
+--------
+
There are many different APIs in the kernel that allows setting of memory
attributes at the page level. In order to avoid aliasing, these interfaces
should be used thoughtfully. Below is a table of interfaces available,
@@ -26,38 +30,38 @@ address range to avoid any aliasing.
API | RAM | ACPI,... | Reserved/Holes |
-----------------------|----------|------------|------------------|
| | | |
-ioremap | -- | UC | UC |
+ioremap | -- | UC- | UC- |
| | | |
ioremap_cache | -- | WB | WB |
| | | |
-ioremap_nocache | -- | UC | UC |
+ioremap_nocache | -- | UC- | UC- |
| | | |
ioremap_wc | -- | -- | WC |
| | | |
-set_memory_uc | UC | -- | -- |
+set_memory_uc | UC- | -- | -- |
set_memory_wb | | | |
| | | |
set_memory_wc | WC | -- | -- |
set_memory_wb | | | |
| | | |
-pci sysfs resource | -- | -- | UC |
+pci sysfs resource | -- | -- | UC- |
| | | |
pci sysfs resource_wc | -- | -- | WC |
is IORESOURCE_PREFETCH| | | |
| | | |
-pci proc | -- | -- | UC |
+pci proc | -- | -- | UC- |
!PCIIOC_WRITE_COMBINE | | | |
| | | |
pci proc | -- | -- | WC |
PCIIOC_WRITE_COMBINE | | | |
| | | |
-/dev/mem | -- | UC | UC |
+/dev/mem | -- | WB/WC/UC- | WB/WC/UC- |
read-write | | | |
| | | |
-/dev/mem | -- | UC | UC |
+/dev/mem | -- | UC- | UC- |
mmap SYNC flag | | | |
| | | |
-/dev/mem | -- | WB/WC/UC | WB/WC/UC |
+/dev/mem | -- | WB/WC/UC- | WB/WC/UC- |
mmap !SYNC flag | |(from exist-| (from exist- |
and | | ing alias)| ing alias) |
any alias to this area| | | |
@@ -68,7 +72,7 @@ pci proc | -- | -- | WC |
and | | | |
MTRR says WB | | | |
| | | |
-/dev/mem | -- | -- | UC_MINUS |
+/dev/mem | -- | -- | UC- |
mmap !SYNC flag | | | |
no alias to this area | | | |
and | | | |
@@ -98,3 +102,35 @@ types.
Drivers should use set_memory_[uc|wc] to set access type for RAM ranges.
+
+PAT debugging
+-------------
+
+With CONFIG_DEBUG_FS enabled, PAT memtype list can be examined by
+
+# mount -t debugfs debugfs /sys/kernel/debug
+# cat /sys/kernel/debug/x86/pat_memtype_list
+PAT memtype list:
+uncached-minus @ 0x7fadf000-0x7fae0000
+uncached-minus @ 0x7fb19000-0x7fb1a000
+uncached-minus @ 0x7fb1a000-0x7fb1b000
+uncached-minus @ 0x7fb1b000-0x7fb1c000
+uncached-minus @ 0x7fb1c000-0x7fb1d000
+uncached-minus @ 0x7fb1d000-0x7fb1e000
+uncached-minus @ 0x7fb1e000-0x7fb25000
+uncached-minus @ 0x7fb25000-0x7fb26000
+uncached-minus @ 0x7fb26000-0x7fb27000
+uncached-minus @ 0x7fb27000-0x7fb28000
+uncached-minus @ 0x7fb28000-0x7fb2e000
+uncached-minus @ 0x7fb2e000-0x7fb2f000
+uncached-minus @ 0x7fb2f000-0x7fb30000
+uncached-minus @ 0x7fb31000-0x7fb32000
+uncached-minus @ 0x80000000-0x90000000
+
+This list shows physical address ranges and various PAT settings used to
+access those physical address ranges.
+
+Another, more verbose way of getting PAT related debug messages is with
+"debugpat" boot parameter. With this parameter, various debug messages are
+printed to dmesg log.
+
diff --git a/Documentation/x86/i386/usb-legacy-support.txt b/Documentation/x86/usb-legacy-support.txt
index 1894cdfc69d..1894cdfc69d 100644
--- a/Documentation/x86/i386/usb-legacy-support.txt
+++ b/Documentation/x86/usb-legacy-support.txt
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index b0c7b6c4abd..72ffb5373ec 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -54,10 +54,6 @@ APICs
apicmaintimer. Useful when your PIT timer is totally
broken.
- disable_8254_timer / enable_8254_timer
- Enable interrupt 0 timer routing over the 8254 in addition to over
- the IO-APIC. The kernel tries to set a sensible default.
-
Early Console
syntax: earlyprintk=vga
diff --git a/Documentation/x86/i386/zero-page.txt b/Documentation/x86/zero-page.txt
index 169ad423a3d..169ad423a3d 100644
--- a/Documentation/x86/i386/zero-page.txt
+++ b/Documentation/x86/zero-page.txt