diff options
Diffstat (limited to 'Documentation')
50 files changed, 1891 insertions, 399 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi new file mode 100644 index 00000000000..5ac1e01bbd4 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -0,0 +1,46 @@ +What: /sys/class/bdi/<bdi>/ +Date: January 2008 +Contact: Peter Zijlstra <a.p.zijlstra@chello.nl> +Description: + +Provide a place in sysfs for the backing_dev_info object. This allows +setting and retrieving various BDI specific variables. + +The <bdi> identifier can be either of the following: + +MAJOR:MINOR + + Device number for block devices, or value of st_dev on + non-block filesystems which provide their own BDI, such as NFS + and FUSE. + +default + + The default backing dev, used for non-block device backed + filesystems which do not provide their own BDI. + +Files under /sys/class/bdi/<bdi>/ +--------------------------------- + +read_ahead_kb (read-write) + + Size of the read-ahead window in kilobytes + +min_ratio (read-write) + + Under normal circumstances each device is given a part of the + total write-back cache that relates to its current average + writeout speed in relation to the other devices. + + The 'min_ratio' parameter allows assigning a minimum + percentage of the write-back cache to a particular device. + For example, this is useful for providing a minimum QoS. + +max_ratio (read-write) + + Allows limiting a particular device to use not more than the + given percentage of the write-back cache. This is useful in + situations where we want to avoid one device taking all or + most of the write-back cache. For example in case of an NFS + mount that is prone to get stuck, or a FUSE mount which cannot + be trusted to play fair. 
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index b939ebb6287..80d150458c8 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -145,7 +145,7 @@ Part Ic - DMA addressing limitations int dma_supported(struct device *dev, u64 mask) int -pci_dma_supported(struct device *dev, u64 mask) +pci_dma_supported(struct pci_dev *hwdev, u64 mask) Checks to see if the device can support DMA to the memory described by mask. @@ -189,7 +189,7 @@ dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction) dma_addr_t -pci_map_single(struct device *dev, void *cpu_addr, size_t size, +pci_map_single(struct pci_dev *hwdev, void *cpu_addr, size_t size, int direction) Maps a piece of processor virtual memory so it can be accessed by the @@ -395,6 +395,71 @@ Notes: You must do this: See also dma_map_single(). +dma_addr_t +dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) + +void +dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) + +int +dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) + +void +dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) + +The four functions above are just like the counterpart functions +without the _attrs suffixes, except that they pass an optional +struct dma_attrs*. + +struct dma_attrs encapsulates a set of "dma attributes". For the +definition of struct dma_attrs see linux/dma-attrs.h. + +The interpretation of dma attributes is architecture-specific, and +each attribute should be documented in Documentation/DMA-attributes.txt. 
+ +If struct dma_attrs* is NULL, the semantics of each of these +functions is identical to those of the corresponding function +without the _attrs suffix. As a result dma_map_single_attrs() +can generally replace dma_map_single(), etc. + +As an example of the use of the *_attrs functions, here's how +you could pass an attribute DMA_ATTR_FOO when mapping memory +for DMA: + +#include <linux/dma-attrs.h> +/* DMA_ATTR_FOO should be defined in linux/dma-attrs.h and + * documented in Documentation/DMA-attributes.txt */ +... + + DEFINE_DMA_ATTRS(attrs); + dma_set_attr(DMA_ATTR_FOO, &attrs); + .... + n = dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, &attrs); + .... + +Architectures that care about DMA_ATTR_FOO would check for its +presence in their implementations of the mapping and unmapping +routines, e.g.: + +void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + .... + int foo = dma_get_attr(DMA_ATTR_FOO, attrs); + .... + if (foo) + /* twizzle the frobnozzle */ + .... + Part II - Advanced dma_ usage ----------------------------- diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt new file mode 100644 index 00000000000..6d772f84b47 --- /dev/null +++ b/Documentation/DMA-attributes.txt @@ -0,0 +1,24 @@ + DMA attributes + ============== + +This document describes the semantics of the DMA attributes that are +defined in linux/dma-attrs.h. + +DMA_ATTR_WRITE_BARRIER +---------------------- + +DMA_ATTR_WRITE_BARRIER is a (write) barrier attribute for DMA. DMA +to a memory region with the DMA_ATTR_WRITE_BARRIER attribute forces +all pending DMA writes to complete, and thus provides a mechanism to +strictly order DMA from a device across all intervening busses and +bridges. 
This barrier is not specific to a particular type of +interconnect, it applies to the system as a whole, and so its +implementation must account for the idiosyncrasies of the system all +the way from the DMA device to memory. + +As an example of a situation where DMA_ATTR_WRITE_BARRIER would be +useful, suppose that a device does a DMA write to indicate that data is +ready and available in memory. The DMA of the "completion indication" +could race with data DMA. Mapping the memory used for completion +indications with DMA_ATTR_WRITE_BARRIER would prevent the race. + diff --git a/Documentation/DMA-mapping.txt b/Documentation/DMA-mapping.txt index d84f89dbf92..b463ecd0c7c 100644 --- a/Documentation/DMA-mapping.txt +++ b/Documentation/DMA-mapping.txt @@ -315,11 +315,11 @@ you should do: dma_addr_t dma_handle; - cpu_addr = pci_alloc_consistent(dev, size, &dma_handle); + cpu_addr = pci_alloc_consistent(pdev, size, &dma_handle); -where dev is a struct pci_dev *. You should pass NULL for PCI like buses -where devices don't have struct pci_dev (like ISA, EISA). This may be -called in interrupt context. +where pdev is a struct pci_dev *. This may be called in interrupt context. +You should use dma_alloc_coherent (see DMA-API.txt) for buses +where devices don't have struct pci_dev (like ISA, EISA). This argument is needed because the DMA translations may be bus specific (and often is private to the bus which the device is attached @@ -332,7 +332,7 @@ __get_free_pages (but takes size instead of a page order). If your driver needs regions sized smaller than a page, you may prefer using the pci_pool interface, described below. -The consistent DMA mapping interfaces, for non-NULL dev, will by +The consistent DMA mapping interfaces, for non-NULL pdev, will by default return a DMA address which is SAC (Single Address Cycle) addressable. 
Even if the device indicates (via PCI dma mask) that it may address the upper 32-bits and thus perform DAC cycles, consistent @@ -354,9 +354,9 @@ buffer you receive will not cross a 64K boundary. To unmap and free such a DMA region, you call: - pci_free_consistent(dev, size, cpu_addr, dma_handle); + pci_free_consistent(pdev, size, cpu_addr, dma_handle); -where dev, size are the same as in the above call and cpu_addr and +where pdev, size are the same as in the above call and cpu_addr and dma_handle are the values pci_alloc_consistent returned to you. This function may not be called in interrupt context. @@ -371,9 +371,9 @@ Create a pci_pool like this: struct pci_pool *pool; - pool = pci_pool_create(name, dev, size, align, alloc); + pool = pci_pool_create(name, pdev, size, align, alloc); -The "name" is for diagnostics (like a kmem_cache name); dev and size +The "name" is for diagnostics (like a kmem_cache name); pdev and size are as above. The device's hardware alignment requirement for this type of data is "align" (which is expressed in bytes, and must be a power of two). If your device has no boundary crossing restrictions, @@ -472,11 +472,11 @@ To map a single region, you do: void *addr = buffer->ptr; size_t size = buffer->len; - dma_handle = pci_map_single(dev, addr, size, direction); + dma_handle = pci_map_single(pdev, addr, size, direction); and to unmap it: - pci_unmap_single(dev, dma_handle, size, direction); + pci_unmap_single(pdev, dma_handle, size, direction); You should call pci_unmap_single when the DMA activity is finished, e.g. from the interrupt which told you that the DMA transfer is done. @@ -493,17 +493,17 @@ Specifically: unsigned long offset = buffer->offset; size_t size = buffer->len; - dma_handle = pci_map_page(dev, page, offset, size, direction); + dma_handle = pci_map_page(pdev, page, offset, size, direction); ... 
- pci_unmap_page(dev, dma_handle, size, direction); + pci_unmap_page(pdev, dma_handle, size, direction); Here, "offset" means byte offset within the given page. With scatterlists, you map a region gathered from several regions by: - int i, count = pci_map_sg(dev, sglist, nents, direction); + int i, count = pci_map_sg(pdev, sglist, nents, direction); struct scatterlist *sg; for_each_sg(sglist, sg, count, i) { @@ -527,7 +527,7 @@ accessed sg->address and sg->length as shown above. To unmap a scatterlist, just call: - pci_unmap_sg(dev, sglist, nents, direction); + pci_unmap_sg(pdev, sglist, nents, direction); Again, make sure DMA activity has already finished. @@ -550,11 +550,11 @@ correct copy of the DMA buffer. So, firstly, just map it with pci_map_{single,sg}, and after each DMA transfer call either: - pci_dma_sync_single_for_cpu(dev, dma_handle, size, direction); + pci_dma_sync_single_for_cpu(pdev, dma_handle, size, direction); or: - pci_dma_sync_sg_for_cpu(dev, sglist, nents, direction); + pci_dma_sync_sg_for_cpu(pdev, sglist, nents, direction); as appropriate. 
@@ -562,7 +562,7 @@ Then, if you wish to let the device get at the DMA area again, finish accessing the data with the cpu, and then before actually giving the buffer to the hardware call either: - pci_dma_sync_single_for_device(dev, dma_handle, size, direction); + pci_dma_sync_single_for_device(pdev, dma_handle, size, direction); or: @@ -739,7 +739,7 @@ failure can be determined by: dma_addr_t dma_handle; - dma_handle = pci_map_single(dev, addr, size, direction); + dma_handle = pci_map_single(pdev, addr, size, direction); if (pci_dma_mapping_error(dma_handle)) { /* * reduce current DMA mapping usage, diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 83966e94cc3..0eb0d027eb3 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -12,7 +12,7 @@ DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml videobook.xml \ kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \ gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ - mac80211.xml + mac80211.xml debugobjects.xml ### # The build process is as follows (targets): diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl new file mode 100644 index 00000000000..7f5f218015f --- /dev/null +++ b/Documentation/DocBook/debugobjects.tmpl @@ -0,0 +1,391 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="debug-objects-guide"> + <bookinfo> + <title>Debug objects life time</title> + + <authorgroup> + <author> + <firstname>Thomas</firstname> + <surname>Gleixner</surname> + <affiliation> + <address> + <email>tglx@linutronix.de</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2008</year> + <holder>Thomas Gleixner</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; 
you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <para> + debugobjects is a generic infrastructure to track the life time + of kernel objects and validate the operations on those. + </para> + <para> + debugobjects is useful to check for the following error patterns: + <itemizedlist> + <listitem><para>Activation of uninitialized objects</para></listitem> + <listitem><para>Initialization of active objects</para></listitem> + <listitem><para>Usage of freed/destroyed objects</para></listitem> + </itemizedlist> + </para> + <para> + debugobjects is not changing the data structure of the real + object so it can be compiled in with a minimal runtime impact + and enabled on demand with a kernel command line option. + </para> + </chapter> + + <chapter id="howto"> + <title>Howto use debugobjects</title> + <para> + A kernel subsystem needs to provide a data structure which + describes the object type and add calls into the debug code at + appropriate places. The data structure to describe the object + type needs at minimum the name of the object type. 
Optional + functions can and should be provided to fixup detected problems + so the kernel can continue to work and the debug information can + be retrieved from a live system instead of hard core debugging + with serial consoles and stack trace transcripts from the + monitor. + </para> + <para> + The debug calls provided by debugobjects are: + <itemizedlist> + <listitem><para>debug_object_init</para></listitem> + <listitem><para>debug_object_init_on_stack</para></listitem> + <listitem><para>debug_object_activate</para></listitem> + <listitem><para>debug_object_deactivate</para></listitem> + <listitem><para>debug_object_destroy</para></listitem> + <listitem><para>debug_object_free</para></listitem> + </itemizedlist> + Each of these functions takes the address of the real object and + a pointer to the object type specific debug description + structure. + </para> + <para> + Each detected error is reported in the statistics and a limited + number of errors are printk'ed including a full stack trace. + </para> + <para> + The statistics are available via debugfs/debug_objects/stats. + They provide information about the number of warnings and the + number of successful fixups along with information about the + usage of the internal tracking objects and the state of the + internal tracking objects pool. + </para> + </chapter> + <chapter id="debugfunctions"> + <title>Debug functions</title> + <sect1 id="prototypes"> + <title>Debug object function reference</title> +!Elib/debugobjects.c + </sect1> + <sect1 id="debug_object_init"> + <title>debug_object_init</title> + <para> + This function is called whenever the initialization function + of a real object is called. + </para> + <para> + When the real object is already tracked by debugobjects it is + checked, whether the object can be initialized. Initializing + is not allowed for active and destroyed objects. 
When + debugobjects detects an error, then it calls the fixup_init + function of the object type description structure if provided + by the caller. The fixup function can correct the problem + before the real initialization of the object happens. E.g. it + can deactivate an active object in order to prevent damage to + the subsystem. + </para> + <para> + When the real object is not yet tracked by debugobjects, + debugobjects allocates a tracker object for the real object + and sets the tracker object state to ODEBUG_STATE_INIT. It + verifies that the object is not on the callers stack. If it is + on the callers stack then a limited number of warnings + including a full stack trace is printk'ed. The calling code + must use debug_object_init_on_stack() and remove the object + before leaving the function which allocated it. See next + section. + </para> + </sect1> + + <sect1 id="debug_object_init_on_stack"> + <title>debug_object_init_on_stack</title> + <para> + This function is called whenever the initialization function + of a real object which resides on the stack is called. + </para> + <para> + When the real object is already tracked by debugobjects it is + checked, whether the object can be initialized. Initializing + is not allowed for active and destroyed objects. When + debugobjects detects an error, then it calls the fixup_init + function of the object type description structure if provided + by the caller. The fixup function can correct the problem + before the real initialization of the object happens. E.g. it + can deactivate an active object in order to prevent damage to + the subsystem. + </para> + <para> + When the real object is not yet tracked by debugobjects + debugobjects allocates a tracker object for the real object + and sets the tracker object state to ODEBUG_STATE_INIT. It + verifies that the object is on the callers stack. 
+ </para> + <para> + An object which is on the stack must be removed from the + tracker by calling debug_object_free() before the function + which allocates the object returns. Otherwise we keep track of + stale objects. + </para> + </sect1> + + <sect1 id="debug_object_activate"> + <title>debug_object_activate</title> + <para> + This function is called whenever the activation function of a + real object is called. + </para> + <para> + When the real object is already tracked by debugobjects it is + checked, whether the object can be activated. Activating is + not allowed for active and destroyed objects. When + debugobjects detects an error, then it calls the + fixup_activate function of the object type description + structure if provided by the caller. The fixup function can + correct the problem before the real activation of the object + happens. E.g. it can deactivate an active object in order to + prevent damage to the subsystem. + </para> + <para> + When the real object is not yet tracked by debugobjects then + the fixup_activate function is called if available. This is + necessary to allow the legitimate activation of statically + allocated and initialized objects. The fixup function checks + whether the object is valid and calls the debug_object_init() + function to initialize the tracking of this object. + </para> + <para> + When the activation is legitimate, then the state of the + associated tracker object is set to ODEBUG_STATE_ACTIVE. + </para> + </sect1> + + <sect1 id="debug_object_deactivate"> + <title>debug_object_deactivate</title> + <para> + This function is called whenever the deactivation function of + a real object is called. + </para> + <para> + When the real object is tracked by debugobjects it is checked, + whether the object can be deactivated. Deactivating is not + allowed for untracked or destroyed objects. 
+ </para> + <para> + When the deactivation is legitimate, then the state of the + associated tracker object is set to ODEBUG_STATE_INACTIVE. + </para> + </sect1> + + <sect1 id="debug_object_destroy"> + <title>debug_object_destroy</title> + <para> + This function is called to mark an object destroyed. This is + useful to prevent the usage of invalid objects, which are + still available in memory: either statically allocated objects + or objects which are freed later. + </para> + <para> + When the real object is tracked by debugobjects it is checked, + whether the object can be destroyed. Destruction is not + allowed for active and destroyed objects. When debugobjects + detects an error, then it calls the fixup_destroy function of + the object type description structure if provided by the + caller. The fixup function can correct the problem before the + real destruction of the object happens. E.g. it can deactivate + an active object in order to prevent damage to the subsystem. + </para> + <para> + When the destruction is legitimate, then the state of the + associated tracker object is set to ODEBUG_STATE_DESTROYED. + </para> + </sect1> + + <sect1 id="debug_object_free"> + <title>debug_object_free</title> + <para> + This function is called before an object is freed. + </para> + <para> + When the real object is tracked by debugobjects it is checked, + whether the object can be freed. Free is not allowed for + active objects. When debugobjects detects an error, then it + calls the fixup_free function of the object type description + structure if provided by the caller. The fixup function can + correct the problem before the real free of the object + happens. E.g. it can deactivate an active object in order to + prevent damage to the subsystem. + </para> + <para> + Note that debug_object_free removes the object from the + tracker. Later usage of the object is detected by the other + debug checks. 
+ </para> + </sect1> + </chapter> + <chapter id="fixupfunctions"> + <title>Fixup functions</title> + <sect1 id="debug_obj_descr"> + <title>Debug object type description structure</title> +!Iinclude/linux/debugobjects.h + </sect1> + <sect1 id="fixup_init"> + <title>fixup_init</title> + <para> + This function is called from the debug code whenever a problem + in debug_object_init is detected. The function takes the + address of the object and the state which is currently + recorded in the tracker. + </para> + <para> + Called from debug_object_init when the object state is: + <itemizedlist> + <listitem><para>ODEBUG_STATE_ACTIVE</para></listitem> + </itemizedlist> + </para> + <para> + The function returns 1 when the fixup was successful, + otherwise 0. The return value is used to update the + statistics. + </para> + <para> + Note, that the function needs to call the debug_object_init() + function again, after the damage has been repaired in order to + keep the state consistent. + </para> + </sect1> + + <sect1 id="fixup_activate"> + <title>fixup_activate</title> + <para> + This function is called from the debug code whenever a problem + in debug_object_activate is detected. + </para> + <para> + Called from debug_object_activate when the object state is: + <itemizedlist> + <listitem><para>ODEBUG_STATE_NOTAVAILABLE</para></listitem> + <listitem><para>ODEBUG_STATE_ACTIVE</para></listitem> + </itemizedlist> + </para> + <para> + The function returns 1 when the fixup was successful, + otherwise 0. The return value is used to update the + statistics. + </para> + <para> + Note that the function needs to call the debug_object_activate() + function again after the damage has been repaired in order to + keep the state consistent. + </para> + <para> + The activation of statically initialized objects is a special + case. When debug_object_activate() has no tracked object for + this object address then fixup_activate() is called with + object state ODEBUG_STATE_NOTAVAILABLE. 
The fixup function + needs to check whether this is a legitimate case of a + statically initialized object or not. In case it is it calls + debug_object_init() and debug_object_activate() to make the + object known to the tracker and marked active. In this case + the function should return 0 because this is not a real fixup. + </para> + </sect1> + + <sect1 id="fixup_destroy"> + <title>fixup_destroy</title> + <para> + This function is called from the debug code whenever a problem + in debug_object_destroy is detected. + </para> + <para> + Called from debug_object_destroy when the object state is: + <itemizedlist> + <listitem><para>ODEBUG_STATE_ACTIVE</para></listitem> + </itemizedlist> + </para> + <para> + The function returns 1 when the fixup was successful, + otherwise 0. The return value is used to update the + statistics. + </para> + </sect1> + <sect1 id="fixup_free"> + <title>fixup_free</title> + <para> + This function is called from the debug code whenever a problem + in debug_object_free is detected. Further it can be called + from the debug checks in kfree/vfree, when an active object is + detected from the debug_check_no_obj_freed() sanity checks. + </para> + <para> + Called from debug_object_free() or debug_check_no_obj_freed() + when the object state is: + <itemizedlist> + <listitem><para>ODEBUG_STATE_ACTIVE</para></listitem> + </itemizedlist> + </para> + <para> + The function returns 1 when the fixup was successful, + otherwise 0. The return value is used to update the + statistics. + </para> + </sect1> + </chapter> + <chapter id="bugs"> + <title>Known Bugs And Assumptions</title> + <para> + None (knock on wood). 
+ </para> + </chapter> +</book> diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index 488dd4a4945..b7b1482f6e0 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -119,7 +119,7 @@ X!Ilib/string.c !Elib/string.c </sect1> <sect1><title>Bit Operations</title> -!Iinclude/asm-x86/bitops_32.h +!Iinclude/asm-x86/bitops.h </sect1> </chapter> @@ -645,4 +645,58 @@ X!Idrivers/video/console/fonts.c !Edrivers/i2c/i2c-core.c </chapter> + <chapter id="clk"> + <title>Clock Framework</title> + + <para> + The clock framework defines programming interfaces to support + software management of the system clock tree. + This framework is widely used with System-On-Chip (SOC) platforms + to support power management and various devices which may need + custom clock rates. + Note that these "clocks" don't relate to timekeeping or real + time clocks (RTCs), each of which have separate frameworks. + These <structname>struct clk</structname> instances may be used + to manage for example a 96 MHz signal that is used to shift bits + into and out of peripherals or busses, or otherwise trigger + synchronous state machine transitions in system hardware. + </para> + + <para> + Power management is supported by explicit software clock gating: + unused clocks are disabled, so the system doesn't waste power + changing the state of transistors that aren't in active use. + On some systems this may be backed by hardware clock gating, + where clocks are gated without being disabled in software. + Sections of chips that are powered but not clocked may be able + to retain their last state. + This low power state is often called a <emphasis>retention + mode</emphasis>. + This mode still incurs leakage currents, especially with finer + circuit geometries, but for CMOS circuits power is mostly used + by clocked state changes. 
+ </para> + + <para> + Power-aware drivers only enable their clocks when the device + they manage is in active use. Also, system sleep states often + differ according to which clock domains are active: while a + "standby" state may allow wakeup from several active domains, a + "mem" (suspend-to-RAM) state may require a more wholesale shutdown + of clocks derived from higher speed PLLs and oscillators, limiting + the number of possible wakeup event sources. A driver's suspend + method may need to be aware of system-specific clock constraints + on the target sleep state. + </para> + + <para> + Some platforms support programmable clock generators. These + can be used by external chips of various kinds, such as other + CPUs, multimedia codecs, and devices with strict requirements + for interface clocking. + </para> + +!Iinclude/linux/clk.h + </chapter> + </book> diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl index b9e143e28c6..54eb26b5737 100644 --- a/Documentation/DocBook/rapidio.tmpl +++ b/Documentation/DocBook/rapidio.tmpl @@ -133,7 +133,6 @@ !Idrivers/rapidio/rio-sysfs.c </sect1> <sect1 id="PPC32_support"><title>PPC32 support</title> -!Iarch/powerpc/kernel/rio.c !Earch/powerpc/sysdev/fsl_rio.c !Iarch/powerpc/sysdev/fsl_rio.c </sect1> diff --git a/Documentation/braille-console.txt b/Documentation/braille-console.txt new file mode 100644 index 00000000000..000b0fbdc10 --- /dev/null +++ b/Documentation/braille-console.txt @@ -0,0 +1,34 @@ + Linux Braille Console + +To get early boot messages on a braille device (before userspace screen +readers can start), you first need to compile the support for the usual serial +console (see serial-console.txt), and for braille device (in Device Drivers - +Accessibility). + +Then you need to specify a console=brl, option on the kernel command line, the +format is: + + console=brl,serial_options... + +where serial_options... 
are the same as described in serial-console.txt + +So for instance you can use console=brl,ttyS0 if the braille device is connected +to the first serial port, and console=brl,ttyS0,115200 to override the baud rate +to 115200, etc. + +By default, the braille device will just show the last kernel message (console +mode). To review previous messages, press the Insert key to switch to the VT +review mode. In review mode, the arrow keys permit to browse in the VT content, +page up/down keys go at the top/bottom of the screen, and the home key goes back +to the cursor, hence providing very basic screen reviewing facility. + +Sound feedback can be obtained by adding the braille_console.sound=1 kernel +parameter. + +For simplicity, only one braille console can be enabled, other uses of +console=brl,... will be discarded. Also note that it does not interfere with +the console selection mechanism described in serial-console.txt + +For now, only the VisioBraille device is supported. + +Samuel Thibault <samuel.thibault@ens-lyon.org> diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt index 31d12e21ff8..c298a6690e0 100644 --- a/Documentation/cgroups.txt +++ b/Documentation/cgroups.txt @@ -500,8 +500,7 @@ post-attachment activity that requires memory allocations or blocking. void fork(struct cgroup_subsy *ss, struct task_struct *task) -Called when a task is forked into a cgroup. Also called during -registration for all existing tasks. +Called when a task is forked into a cgroup. void exit(struct cgroup_subsys *ss, struct task_struct *task) diff --git a/Documentation/controllers/devices.txt b/Documentation/controllers/devices.txt new file mode 100644 index 00000000000..4dcea42432c --- /dev/null +++ b/Documentation/controllers/devices.txt @@ -0,0 +1,48 @@ +Device Whitelist Controller + +1. Description: + +Implement a cgroup to track and enforce open and mknod restrictions +on device files. A device cgroup associates a device access +whitelist with each cgroup. 
A whitelist entry has 4 fields. +'type' is a (all), c (char), or b (block). 'all' means it applies +to all types and all major and minor numbers. Major and minor are +either an integer or * for all. Access is a composition of r +(read), w (write), and m (mknod). + +The root device cgroup starts with rwm to 'all'. A child device +cgroup gets a copy of the parent. Administrators can then remove +devices from the whitelist or add new entries. A child cgroup can +never receive a device access which is denied its parent. However +when a device access is removed from a parent it will not also be +removed from the child(ren). + +2. User Interface + +An entry is added using devices.allow, and removed using +devices.deny. For instance + + echo 'c 1:3 mr' > /cgroups/1/devices.allow + +allows cgroup 1 to read and mknod the device usually known as +/dev/null. Doing + + echo a > /cgroups/1/devices.deny + +will remove the default 'a *:* mrw' entry. + +3. Security + +Any task can move itself between cgroups. This clearly won't +suffice, but we can decide the best way to adequately restrict +movement as people get some experience with this. We may just want +to require CAP_SYS_ADMIN, which at least is a separate bit from +CAP_MKNOD. We may want to just refuse moving to a cgroup which +isn't a descendent of the current one. Or we may want to use +CAP_MAC_ADMIN, since we really are trying to lock down root. + +CAP_SYS_ADMIN is needed to modify the whitelist or move another +task to a new cgroup. (Again we'll probably want to change that). + +A cgroup may not be granted more permissions than the cgroup's +parent has. 
diff --git a/Documentation/controllers/resource_counter.txt b/Documentation/controllers/resource_counter.txt new file mode 100644 index 00000000000..f196ac1d7d2 --- /dev/null +++ b/Documentation/controllers/resource_counter.txt @@ -0,0 +1,181 @@ + + The Resource Counter + +The resource counter, declared at include/linux/res_counter.h, +is supposed to facilitate the resource management by controllers +by providing common stuff for accounting. + +This "stuff" includes the res_counter structure and routines +to work with it. + + + +1. Crucial parts of the res_counter structure + + a. unsigned long long usage + + The usage value shows the amount of a resource that is consumed + by a group at a given time. The units of measurement should be + determined by the controller that uses this counter. E.g. it can + be bytes, items or any other unit the controller operates on. + + b. unsigned long long max_usage + + The maximal value of the usage over time. + + This value is useful when gathering statistical information about + the particular group, as it shows the actual resource requirements + for a particular group, not just some usage snapshot. + + c. unsigned long long limit + + The maximal allowed amount of resource to consume by the group. In + case the group requests for more resources, so that the usage value + would exceed the limit, the resource allocation is rejected (see + the next section). + + d. unsigned long long failcnt + + The failcnt stands for "failures counter". This is the number of + resource allocation attempts that failed. + + e. spinlock_t lock + + Protects changes of the above values. + + + +2. Basic accounting routines + + a. void res_counter_init(struct res_counter *rc) + + Initializes the resource counter. As usual, should be the first + routine called for a new counter. + + b. 
int res_counter_charge[_locked] + (struct res_counter *rc, unsigned long val) + + When a resource is about to be allocated it has to be accounted + with the appropriate resource counter (controller should determine + which one to use on its own). This operation is called "charging". + + It is not very important which operation - resource allocation + or charging - is performed first, but + * if the allocation is performed first, this may create a + temporary resource over-usage by the time resource counter is + charged; + * if the charging is performed first, then it should be uncharged + on error path (if the one is called). + + c. void res_counter_uncharge[_locked] + (struct res_counter *rc, unsigned long val) + + When a resource is released (freed) it should be de-accounted + from the resource counter it was accounted to. This is called + "uncharging". + + The _locked routines imply that the res_counter->lock is taken. + + + 2.1 Other accounting routines + + There are more routines that may help you with common needs, like + checking whether the limit is reached or resetting the max_usage + value. They are all declared in include/linux/res_counter.h. + + + +3. Analyzing the resource counter registrations + + a. If the failcnt value constantly grows, this means that the counter's + limit is too tight. Either the group is misbehaving and consumes too + many resources, or the configuration is not suitable for the group + and the limit should be increased. + + b. The max_usage value can be used to quickly tune the group. One may + set the limits to maximal values and either load the container with + a common pattern or leave one for a while. After this the max_usage + value shows the amount of memory the container would require during + its common activity. + + Setting the limit a bit above this value gives a pretty good + configuration that works in most of the cases. + + c. 
If the max_usage is much less than the limit, but the failcnt value + is growing, then the group tries to allocate a big chunk of resource + at once. + + d. If the max_usage is much less than the limit, but the failcnt value + is 0, then this group is given too high limit, that it does not + require. It is better to lower the limit a bit leaving more resource + for other groups. + + + +4. Communication with the control groups subsystem (cgroups) + +All the resource controllers that are using cgroups and resource counters +should provide files (in the cgroup filesystem) to work with the resource +counter fields. They are recommended to adhere to the following rules: + + a. File names + + Field name File name + --------------------------------------------------- + usage usage_in_<unit_of_measurement> + max_usage max_usage_in_<unit_of_measurement> + limit limit_in_<unit_of_measurement> + failcnt failcnt + lock no file :) + + b. Reading from file should show the corresponding field value in the + appropriate format. + + c. Writing to file + + Field Expected behavior + ---------------------------------- + usage prohibited + max_usage reset to usage + limit set the limit + failcnt reset to zero + + + +5. Usage example + + a. Declare a task group (take a look at cgroups subsystem for this) and + fold a res_counter into it + + struct my_group { + struct res_counter res; + + <other fields> + } + + b. Put hooks in resource allocation/release paths + + int alloc_something(...) + { + if (res_counter_charge(res_counter_ptr, amount) < 0) + return -ENOMEM; + + <allocate the resource and return to the caller> + } + + void release_something(...) + { + res_counter_uncharge(res_counter_ptr, amount); + + <release the resource> + } + + In order to keep the usage value self-consistent, both the + "res_counter_ptr" and the "amount" in release_something() should be + the same as they were in the alloc_something() when the releasing + resource was allocated. + + c. 
Provide the way to read res_counter values and set them (the cgroups + still can help with it). + + d. Compile and run :) diff --git a/Documentation/cpu-freq/user-guide.txt b/Documentation/cpu-freq/user-guide.txt index af3b925ece0..6c442d8426b 100644 --- a/Documentation/cpu-freq/user-guide.txt +++ b/Documentation/cpu-freq/user-guide.txt @@ -154,6 +154,11 @@ scaling_governor, and by "echoing" the name of another that some governors won't load - they only work on some specific architectures or processors. + +cpuinfo_cur_freq : Current speed of the CPU, in kHz. + +scaling_available_frequencies : List of available frequencies, in kHz. + scaling_min_freq and scaling_max_freq show the current "policy limits" (in kHz). By echoing new values into these @@ -162,6 +167,15 @@ scaling_max_freq show the current "policy limits" (in first set scaling_max_freq, then scaling_min_freq. +affected_cpus : List of CPUs that require software coordination + of frequency. + +related_cpus : List of CPUs that need some sort of frequency + coordination, whether software or hardware. + +scaling_driver : Hardware driver for cpufreq. + +scaling_cur_freq : Current frequency of the CPU, in kHz. If you have selected the "userspace" governor which allows you to set the CPU operating frequency to a specific value, you can read out diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index aa854b9b18c..fb7b361e6ee 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt @@ -171,6 +171,7 @@ files describing that cpuset: - memory_migrate flag: if set, move pages to cpusets nodes - cpu_exclusive flag: is cpu placement exclusive? - mem_exclusive flag: is memory placement exclusive? 
+ - mem_hardwall flag: is memory allocation hardwalled - memory_pressure: measure of how much paging pressure in cpuset In addition, the root cpuset only has the following file: @@ -222,17 +223,18 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than a direct ancestor or descendent, may share any of the same CPUs or Memory Nodes. -A cpuset that is mem_exclusive restricts kernel allocations for -page, buffer and other data commonly shared by the kernel across -multiple users. All cpusets, whether mem_exclusive or not, restrict -allocations of memory for user space. This enables configuring a -system so that several independent jobs can share common kernel data, -such as file system pages, while isolating each jobs user allocation in -its own cpuset. To do this, construct a large mem_exclusive cpuset to -hold all the jobs, and construct child, non-mem_exclusive cpusets for -each individual job. Only a small amount of typical kernel memory, -such as requests from interrupt handlers, is allowed to be taken -outside even a mem_exclusive cpuset. +A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", +i.e. it restricts kernel allocations for page, buffer and other data +commonly shared by the kernel across multiple users. All cpusets, +whether hardwalled or not, restrict allocations of memory for user +space. This enables configuring a system so that several independent +jobs can share common kernel data, such as file system pages, while +isolating each job's user allocation in its own cpuset. To do this, +construct a large mem_exclusive cpuset to hold all the jobs, and +construct child, non-mem_exclusive cpusets for each individual job. +Only a small amount of typical kernel memory, such as requests from +interrupt handlers, is allowed to be taken outside even a +mem_exclusive cpuset. 1.5 What is memory_pressure ? @@ -707,7 +709,7 @@ Now you want to do something with this cpuset. 
In this directory you can find several files: # ls -cpus cpu_exclusive mems mem_exclusive tasks +cpus cpu_exclusive mems mem_exclusive mem_hardwall tasks Reading them will give you information about the state of this cpuset: the CPUs and Memory Nodes it can use, the processes that are using diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 354aec047c0..881e6dd03ae 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -141,6 +141,7 @@ mkprep mktables mktree modpost +modules.order modversions.h* offset.h offsets.h @@ -171,6 +172,7 @@ sm_tbl* split-include tags tftpboot.img +timeconst.h times.h* tkparse trix_boot.h diff --git a/Documentation/fb/gxfb.txt b/Documentation/fb/gxfb.txt new file mode 100644 index 00000000000..2f640903bbb --- /dev/null +++ b/Documentation/fb/gxfb.txt @@ -0,0 +1,52 @@ +[This file is cloned from VesaFB/aty128fb] + +What is gxfb? +================= + +This is a graphics framebuffer driver for AMD Geode GX2 based processors. + +Advantages: + + * No need to use AMD's VSA code (or other VESA emulation layer) in the + BIOS. + * It provides a nice large console (128 cols + 48 lines with 1024x768) + without using tiny, unreadable fonts. + * You can run XF68_FBDev on top of /dev/fb0 + * Most important: boot logo :-) + +Disadvantages: + + * graphic mode is slower than text mode... + + +How to use it? +============== + +Switching modes is done using gxfb.mode_option=<resolution>... boot +parameter or using `fbset' program. + +See Documentation/fb/modedb.txt for more information on modedb +resolutions. + + +X11 +=== + +XF68_FBDev should generally work fine, but it is non-accelerated. + + +Configuration +============= + +You can pass kernel command line options to gxfb with gxfb.<option>. +For example, gxfb.mode_option=800x600@75. +Accepted options: + +mode_option - specify the video mode. 
Of the form + <x>x<y>[-<bpp>][@<refresh>] +vram - size of video ram (normally auto-detected) +vt_switch - enable vt switching during suspend/resume. The vt + switch is slow, but harmless. + +-- +Andres Salomon <dilinger@debian.org> diff --git a/Documentation/fb/intelfb.txt b/Documentation/fb/intelfb.txt index da5ee74219e..27a3160650a 100644 --- a/Documentation/fb/intelfb.txt +++ b/Documentation/fb/intelfb.txt @@ -14,6 +14,8 @@ graphics devices. These would include: Intel 915GM Intel 945G Intel 945GM + Intel 965G + Intel 965GM B. List of available options diff --git a/Documentation/fb/lxfb.txt b/Documentation/fb/lxfb.txt new file mode 100644 index 00000000000..38b3ca6f6ca --- /dev/null +++ b/Documentation/fb/lxfb.txt @@ -0,0 +1,52 @@ +[This file is cloned from VesaFB/aty128fb] + +What is lxfb? +================= + +This is a graphics framebuffer driver for AMD Geode LX based processors. + +Advantages: + + * No need to use AMD's VSA code (or other VESA emulation layer) in the + BIOS. + * It provides a nice large console (128 cols + 48 lines with 1024x768) + without using tiny, unreadable fonts. + * You can run XF68_FBDev on top of /dev/fb0 + * Most important: boot logo :-) + +Disadvantages: + + * graphic mode is slower than text mode... + + +How to use it? +============== + +Switching modes is done using lxfb.mode_option=<resolution>... boot +parameter or using `fbset' program. + +See Documentation/fb/modedb.txt for more information on modedb +resolutions. + + +X11 +=== + +XF68_FBDev should generally work fine, but it is non-accelerated. + + +Configuration +============= + +You can pass kernel command line options to lxfb with lxfb.<option>. +For example, lxfb.mode_option=800x600@75. +Accepted options: + +mode_option - specify the video mode. Of the form + <x>x<y>[-<bpp>][@<refresh>] +vram - size of video ram (normally auto-detected) +vt_switch - enable vt switching during suspend/resume. The vt + switch is slow, but harmless. 
+ +-- +Andres Salomon <dilinger@debian.org> diff --git a/Documentation/fb/metronomefb.txt b/Documentation/fb/metronomefb.txt index b9a2e7b7e83..237ca412582 100644 --- a/Documentation/fb/metronomefb.txt +++ b/Documentation/fb/metronomefb.txt @@ -1,7 +1,7 @@ Metronomefb ----------- Maintained by Jaya Kumar <jayakumar.lkml.gmail.com> -Last revised: Nov 20, 2007 +Last revised: Mar 10, 2008 Metronomefb is a driver for the Metronome display controller. The controller is from E-Ink Corporation. It is intended to be used to drive the E-Ink @@ -11,20 +11,18 @@ display media here http://www.e-ink.com/products/matrix/metronome.html . Metronome is interfaced to the host CPU through the AMLCD interface. The host CPU generates the control information and the image in a framebuffer which is then delivered to the AMLCD interface by a host specific method. -Currently, that's implemented for the PXA's LCDC controller. The display and -error status are each pulled through individual GPIOs. +The display and error status are each pulled through individual GPIOs. -Metronomefb was written for the PXA255/gumstix/lyre combination and -therefore currently has board set specific code in it. If other boards based on -other architectures are available, then the host specific code can be separated -and abstracted out. +Metronomefb is platform independent and depends on a board specific driver +to do all physical IO work. Currently, an example is implemented for the +PXA board used in the AM-200 EPD devkit. This example is am200epd.c Metronomefb requires waveform information which is delivered via the AMLCD interface to the metronome controller. The waveform information is expected to be delivered from userspace via the firmware class interface. The waveform file can be compressed as long as your udev or hotplug script is aware of the need -to uncompress it before delivering it. 
metronomefb will ask for waveform.wbf -which would typically go into /lib/firmware/waveform.wbf depending on your +to uncompress it before delivering it. metronomefb will ask for metronome.wbf +which would typically go into /lib/firmware/metronome.wbf depending on your udev/hotplug setup. I have only tested with a single waveform file which was originally labeled 23P01201_60_WT0107_MTC. I do not know what it stands for. Caution should be exercised when manipulating the waveform as there may be diff --git a/Documentation/fb/modedb.txt b/Documentation/fb/modedb.txt index 4fcdb4cf4cc..ec4dee75a35 100644 --- a/Documentation/fb/modedb.txt +++ b/Documentation/fb/modedb.txt @@ -125,8 +125,12 @@ There may be more modes. amifb - Amiga chipset frame buffer aty128fb - ATI Rage128 / Pro frame buffer atyfb - ATI Mach64 frame buffer + pm2fb - Permedia 2/2V frame buffer + pm3fb - Permedia 3 frame buffer + sstfb - Voodoo 1/2 (SST1) chipset frame buffer tdfxfb - 3D Fx frame buffer tridentfb - Trident (Cyber)blade chipset frame buffer + vt8623fb - VIA 8623 frame buffer BTW, only a few drivers use this at the moment. Others are to follow (feel free to send patches). diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 448729fcaeb..3c35d452b1a 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -128,15 +128,6 @@ Who: Arjan van de Ven <arjan@linux.intel.com> --------------------------- -What: vm_ops.nopage -When: Soon, provided in-kernel callers have been converted -Why: This interface is replaced by vm_ops.fault, but it has been around - forever, is used by a lot of drivers, and doesn't cost much to - maintain. 
-Who: Nick Piggin <npiggin@suse.de> - ---------------------------- - What: PHYSDEVPATH, PHYSDEVBUS, PHYSDEVDRIVER in the uevent environment When: October 2008 Why: The stacking of class devices makes these values misleading and @@ -147,6 +138,24 @@ Who: Kay Sievers <kay.sievers@suse.de> --------------------------- +What: find_task_by_pid +When: 2.6.26 +Why: With pid namespaces, calling this function will return the + wrong task when called from inside a namespace. + + The best way to save a task pid and find a task by this + pid later, is to find this task's struct pid pointer (or get + it directly from the task) and call pid_task() later. + + If someone really needs to get a task by its pid_t, then + he most likely needs the find_task_by_vpid() to get the + task from the same namespace as the current task is in, but + this may be not so in general. + +Who: Pavel Emelyanov <xemul@openvz.org> + +--------------------------- + What: ACPI procfs interface When: July 2008 Why: ACPI sysfs conversion should be finished by January 2008. diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 42d4b30b104..c2992bc54f2 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -511,7 +511,6 @@ prototypes: void (*open)(struct vm_area_struct*); void (*close)(struct vm_area_struct*); int (*fault)(struct vm_area_struct*, struct vm_fault *); - struct page *(*nopage)(struct vm_area_struct*, unsigned long, int *); int (*page_mkwrite)(struct vm_area_struct *, struct page *); locking rules: @@ -519,7 +518,6 @@ locking rules: open: no yes close: no yes fault: no yes -nopage: no yes page_mkwrite: no yes no ->page_mkwrite() is called when a previously read-only page is @@ -537,4 +535,3 @@ NULL. ipc/shm.c::shm_delete() - may need BKL. ->read() and ->write() in many drivers are (probably) missing BKL. -drivers/sgi/char/graphics.c::sgi_graphics_nopage() - may need BKL. 
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 2a99116edc4..dbc3c6a3650 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -463,11 +463,17 @@ SwapTotal: 0 kB SwapFree: 0 kB Dirty: 968 kB Writeback: 0 kB +AnonPages: 861800 kB Mapped: 280372 kB -Slab: 684068 kB +Slab: 284364 kB +SReclaimable: 159856 kB +SUnreclaim: 124508 kB +PageTables: 24448 kB +NFS_Unstable: 0 kB +Bounce: 0 kB +WritebackTmp: 0 kB CommitLimit: 7669796 kB Committed_AS: 100056 kB -PageTables: 24448 kB VmallocTotal: 112216 kB VmallocUsed: 428 kB VmallocChunk: 111088 kB @@ -503,8 +509,17 @@ VmallocChunk: 111088 kB on the disk Dirty: Memory which is waiting to get written back to the disk Writeback: Memory which is actively being written back to the disk + AnonPages: Non-file backed pages mapped into userspace page tables Mapped: files which have been mmaped, such as libraries Slab: in-kernel data structures cache +SReclaimable: Part of Slab, that might be reclaimed, such as caches + SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure + PageTables: amount of memory dedicated to the lowest level of page + tables. +NFS_Unstable: NFS pages sent to the server, but not yet committed to stable + storage + Bounce: Memory used for block device "bounce buffers" +WritebackTmp: Memory used by FUSE for temporary writeback buffers CommitLimit: Based on the overcommit ratio ('vm.overcommit_ratio'), this is the total amount of memory currently available to be allocated on the system. This limit is only adhered to @@ -531,8 +546,6 @@ Committed_AS: The amount of memory presently allocated on the system. above) will not be permitted. This is useful if one needs to guarantee that processes will not fail due to lack of memory once that memory has been successfully allocated. - PageTables: amount of memory dedicated to the lowest level of page - tables. 
VmallocTotal: total size of vmalloc memory area VmallocUsed: amount of vmalloc area which is used VmallocChunk: largest contigious block of vmalloc area which is free diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index 145e4408635..222437efd75 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -92,6 +92,18 @@ NodeList format is a comma-separated list of decimal numbers and ranges, a range being two hyphen-separated decimal numbers, the smallest and largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15 +NUMA memory allocation policies have optional flags that can be used in +conjunction with their modes. These optional flags can be specified +when tmpfs is mounted by appending them to the mode before the NodeList. +See Documentation/vm/numa_memory_policy.txt for a list of all available +memory allocation policy mode flags. + + =static is equivalent to MPOL_F_STATIC_NODES + =relative is equivalent to MPOL_F_RELATIVE_NODES + +For example, mpol=bind=static:NodeList, is the equivalent of an +allocation policy of MPOL_BIND | MPOL_F_STATIC_NODES. + Note that trying to mount a tmpfs with an mpol option will fail if the running kernel does not support NUMA; and will fail if its nodelist specifies a node which is not online. If your system relies on that diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index fcc123ffa25..2d5e1e582e1 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -17,6 +17,21 @@ dmask=### -- The permission mask for the directory. fmask=### -- The permission mask for files. The default is the umask of current process. +allow_utime=### -- This option controls the permission check of mtime/atime. + + 20 - If current process is in group of file's group ID, + you can change timestamp. + 2 - Other users can change timestamp. + + The default is set from `dmask' option. 
(If the directory is + writable, utime(2) is also allowed. I.e. ~dmask & 022) + + Normally utime(2) checks current process is owner of + the file, or it has CAP_FOWNER capability. But FAT + filesystem doesn't have uid/gid on disk, so normal + check is too inflexible. With this option you can + relax it. + codepage=### -- Sets the codepage number for converting to shortname characters on FAT filesystem. By default, FAT_DEFAULT_CODEPAGE setting is used. diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt index 54630095aa3..c35ca9e40d4 100644 --- a/Documentation/gpio.txt +++ b/Documentation/gpio.txt @@ -107,6 +107,16 @@ type of GPIO controller, and on one particular board 80-95 with an FPGA. The numbers need not be contiguous; either of those platforms could also use numbers 2000-2063 to identify GPIOs in a bank of I2C GPIO expanders. +If you want to initialize a structure with an invalid GPIO number, use +some negative number (perhaps "-EINVAL"); that will never be valid. To +test if a number could reference a GPIO, you may use this predicate: + + int gpio_is_valid(int number); + +A number that's not valid will be rejected by calls which may request +or free GPIOs (see below). Other numbers may also be rejected; for +example, a number might be valid but unused on a given board. + Whether a platform supports multiple GPIO controllers is currently a platform-specific implementation issue. diff --git a/Documentation/hwmon/w83l785ts b/Documentation/hwmon/w83l785ts index 1841cedc25b..bd1fa9d4468 100644 --- a/Documentation/hwmon/w83l785ts +++ b/Documentation/hwmon/w83l785ts @@ -33,7 +33,8 @@ Known Issues ------------ On some systems (Asus), the BIOS is known to interfere with the driver -and cause read errors. The driver will retry a given number of times +and cause read errors. Or maybe the W83L785TS-S chip is simply unreliable, +we don't really know. 
The driver will retry a given number of times (5 by default) and then give up, returning the old value (or 0 if there is no old value). It seems to work well enough so that you should not notice anything. Thanks to James Bolt for helping test this feature. diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients index bfb0a552081..ee75cbace28 100644 --- a/Documentation/i2c/writing-clients +++ b/Documentation/i2c/writing-clients @@ -164,7 +164,8 @@ I2C device drivers using this binding model work just like any other kind of driver in Linux: they provide a probe() method to bind to those devices, and a remove() method to unbind. - static int foo_probe(struct i2c_client *client); + static int foo_probe(struct i2c_client *client, + const struct i2c_device_id *id); static int foo_remove(struct i2c_client *client); Remember that the i2c_driver does not create those client handles. The diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt index 0fac3465f2e..95ad15c3b01 100644 --- a/Documentation/i386/boot.txt +++ b/Documentation/i386/boot.txt @@ -40,9 +40,17 @@ Protocol 2.05: (Kernel 2.6.20) Make protected mode kernel relocatable. Introduce relocatable_kernel and kernel_alignment fields. Protocol 2.06: (Kernel 2.6.22) Added a field that contains the size of - the boot command line + the boot command line. -Protocol 2.09: (kernel 2.6.26) Added a field of 64-bit physical +Protocol 2.07: (Kernel 2.6.24) Added paravirtualised boot protocol. + Introduced hardware_subarch and hardware_subarch_data + and KEEP_SEGMENTS flag in load_flags. + +Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format + payload. Introduced payload_offset and payload_length + fields to aid in locating the payload. + +Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical pointer to single linked list of struct setup_data. 
**** MEMORY LAYOUT diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index 649cb879989..00b950d1c19 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt @@ -104,14 +104,15 @@ applicable everywhere (see syntax). Reverse dependencies can only be used with boolean or tristate symbols. Note: - select is evil.... select will by brute force set a symbol - equal to 'y' without visiting the dependencies. So abusing - select you are able to select a symbol FOO even if FOO depends - on BAR that is not set. In general use select only for - non-visible symbols (no prompts anywhere) and for symbols with - no dependencies. That will limit the usefulness but on the - other hand avoid the illegal configurations all over. kconfig - should one day warn about such things. + select should be used with care. select will force + a symbol to a value without visiting the dependencies. + By abusing select you are able to select a symbol FOO even + if FOO depends on BAR that is not set. + In general use select only for non-visible symbols + (no prompts anywhere) and for symbols with no dependencies. + That will limit the usefulness but on the other hand avoid + the illegal configurations all over. + kconfig should one day warn about such things. - numerical ranges: "range" <symbol> <symbol> ["if" <expr>] This allows to limit the range of possible input values for int diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt index d0ac72cc19f..b8e52c0355d 100644 --- a/Documentation/kdump/kdump.txt +++ b/Documentation/kdump/kdump.txt @@ -245,6 +245,8 @@ The syntax is: crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset] range=start-[end] + 'start' is inclusive and 'end' is exclusive. 
+ For example: crashkernel=512M-2G:64M,2G-:128M @@ -253,10 +255,11 @@ This would mean: 1) if the RAM is smaller than 512M, then don't reserve anything (this is the "rescue" case) - 2) if the RAM size is between 512M and 2G, then reserve 64M + 2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M 3) if the RAM size is larger than 2G, then reserve 128M + Boot into System Kernel ======================= diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e5f3d918316..a3c35446e75 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -496,6 +496,11 @@ and is between 256 and 4096 characters. It is defined in the file switching to the matching ttyS device later. The options are the same as for ttyS, above. + If the device connected to the port is not a TTY but a braille + device, prepend "brl," before the device type, for instance + console=brl,ttyS0 + For now, only VisioBraille is supported. + earlycon= [KNL] Output early console device and options. uart[8250],io,<addr>[,options] uart[8250],mmio,<addr>[,options] @@ -556,6 +561,8 @@ and is between 256 and 4096 characters. It is defined in the file 1 will print _a lot_ more information - normally only useful to kernel developers. + debug_objects [KNL] Enable object debugging + decnet.addr= [HW,NET] Format: <area>[,<node>] See also Documentation/networking/decnet.txt. @@ -627,8 +634,7 @@ and is between 256 and 4096 characters. It is defined in the file eata= [HW,SCSI] edd= [EDD] - Format: {"of[f]" | "sk[ipmbr]"} - See comment in arch/i386/boot/edd.S + Format: {"off" | "on" | "skip[mbr]"} eisa_irq_edge= [PARISC,HW] See header of drivers/parisc/eisa.c. @@ -1389,6 +1395,13 @@ and is between 256 and 4096 characters. It is defined in the file nr_uarts= [SERIAL] maximum number of UARTs to be registered. 
+ olpc_ec_timeout= [OLPC] ms delay when issuing EC commands + Rather than timing out after 20 ms if an EC + command is not properly ACKed, override the length + of the timeout. We have interrupts disabled while + waiting for the ACK, so if this is set too high + interrupts *may* be lost! + opl3= [HW,OSS] Format: <io> diff --git a/Documentation/keys-request-key.txt b/Documentation/keys-request-key.txt index 266955d23ee..09b55e46174 100644 --- a/Documentation/keys-request-key.txt +++ b/Documentation/keys-request-key.txt @@ -11,26 +11,29 @@ request_key*(): struct key *request_key(const struct key_type *type, const char *description, - const char *callout_string); + const char *callout_info); or: struct key *request_key_with_auxdata(const struct key_type *type, const char *description, - const char *callout_string, + const char *callout_info, + size_t callout_len, void *aux); or: struct key *request_key_async(const struct key_type *type, const char *description, - const char *callout_string); + const char *callout_info, + size_t callout_len); or: struct key *request_key_async_with_auxdata(const struct key_type *type, const char *description, - const char *callout_string, + const char *callout_info, + size_t callout_len, void *aux); Or by userspace invoking the request_key system call: diff --git a/Documentation/keys.txt b/Documentation/keys.txt index 51652d39e61..d5c7a57d170 100644 --- a/Documentation/keys.txt +++ b/Documentation/keys.txt @@ -170,7 +170,8 @@ The key service provides a number of features besides keys: amount of description and payload space that can be consumed. The user can view information on this and other statistics through procfs - files. + files. The root user may also alter the quota limits through sysctl files + (see the section "New procfs files"). Process-specific and thread-specific keyrings are not counted towards a user's quota. 
@@ -329,6 +330,27 @@ about the status of the key service: <bytes>/<max> Key size quota +Four new sysctl files have been added also for the purpose of controlling the +quota limits on keys: + + (*) /proc/sys/kernel/keys/root_maxkeys + /proc/sys/kernel/keys/root_maxbytes + + These files hold the maximum number of keys that root may have and the + maximum total number of bytes of data that root may have stored in those + keys. + + (*) /proc/sys/kernel/keys/maxkeys + /proc/sys/kernel/keys/maxbytes + + These files hold the maximum number of keys that each non-root user may + have and the maximum total number of bytes of data that each of those + users may have stored in their keys. + +Root may alter these by writing each new limit as a decimal number string to +the appropriate file. + + =============================== USERSPACE SYSTEM CALL INTERFACE =============================== @@ -711,6 +733,27 @@ The keyctl syscall functions are: The assumed authoritative key is inherited across fork and exec. + (*) Get the LSM security context attached to a key. + + long keyctl(KEYCTL_GET_SECURITY, key_serial_t key, char *buffer, + size_t buflen) + + This function returns a string that represents the LSM security context + attached to a key in the buffer provided. + + Unless there's an error, it always returns the amount of data it could + produce, even if that's too big for the buffer, but it won't copy more + than requested to userspace. If the buffer pointer is NULL then no copy + will take place. + + A NUL character is included at the end of the string if the buffer is + sufficiently big. This is included in the returned count. If no LSM is + in force then an empty string will be returned. + + A process must have view permission on the key for this function to be + successful. + + =============== KERNEL SERVICES =============== @@ -771,7 +814,7 @@ payload contents" for more information. 
struct key *request_key(const struct key_type *type, const char *description, - const char *callout_string); + const char *callout_info); This is used to request a key or keyring with a description that matches the description specified according to the key type's match function. This @@ -793,24 +836,28 @@ payload contents" for more information. struct key *request_key_with_auxdata(const struct key_type *type, const char *description, - const char *callout_string, + const void *callout_info, + size_t callout_len, void *aux); This is identical to request_key(), except that the auxiliary data is - passed to the key_type->request_key() op if it exists. + passed to the key_type->request_key() op if it exists, and the callout_info + is a blob of length callout_len, if given (the length may be 0). (*) A key can be requested asynchronously by calling one of: struct key *request_key_async(const struct key_type *type, const char *description, - const char *callout_string); + const void *callout_info, + size_t callout_len); or: struct key *request_key_async_with_auxdata(const struct key_type *type, const char *description, - const char *callout_string, + const char *callout_info, + size_t callout_len, void *aux); which are asynchronous equivalents of request_key() and diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index be89f393274..6877e718711 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -37,6 +37,11 @@ registration function such as register_kprobe() specifies where the probe is to be inserted and what handler is to be called when the probe is hit. +There are also register_/unregister_*probes() functions for batch +registration/unregistration of a group of *probes. These functions +can speed up unregistration process when you have to unregister +a lot of probes at once. + The next three subsections explain how the different types of probes work. 
They explain certain things that you'll need to know in order to make the best use of Kprobes -- e.g., the @@ -190,10 +195,11 @@ code mapping. 4. API Reference The Kprobes API includes a "register" function and an "unregister" -function for each type of probe. Here are terse, mini-man-page -specifications for these functions and the associated probe handlers -that you'll write. See the files in the samples/kprobes/ sub-directory -for examples. +function for each type of probe. The API also includes "register_*probes" +and "unregister_*probes" functions for (un)registering arrays of probes. +Here are terse, mini-man-page specifications for these functions and +the associated probe handlers that you'll write. See the files in the +samples/kprobes/ sub-directory for examples. 4.1 register_kprobe @@ -319,6 +325,43 @@ void unregister_kretprobe(struct kretprobe *rp); Removes the specified probe. The unregister function can be called at any time after the probe has been registered. +NOTE: +If the functions find an incorrect probe (ex. an unregistered probe), +they clear the addr field of the probe. + +4.5 register_*probes + +#include <linux/kprobes.h> +int register_kprobes(struct kprobe **kps, int num); +int register_kretprobes(struct kretprobe **rps, int num); +int register_jprobes(struct jprobe **jps, int num); + +Registers each of the num probes in the specified array. If any +error occurs during registration, all probes in the array, up to +the bad probe, are safely unregistered before the register_*probes +function returns. +- kps/rps/jps: an array of pointers to *probe data structures +- num: the number of the array entries. + +NOTE: +You have to allocate(or define) an array of pointers and set all +of the array entries before using these functions. 
+ +4.6 unregister_*probes + +#include <linux/kprobes.h> +void unregister_kprobes(struct kprobe **kps, int num); +void unregister_kretprobes(struct kretprobe **rps, int num); +void unregister_jprobes(struct jprobe **jps, int num); + +Removes each of the num probes in the specified array at once. + +NOTE: +If the functions find some incorrect probes (ex. unregistered +probes) in the specified array, they clear the addr field of those +incorrect probes. However, other probes in the array are +unregistered correctly. + 5. Kprobes Features and Limitations Kprobes allows multiple probes at the same address. Currently, diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt index 76cb428435d..01c6c3d8a7e 100644 --- a/Documentation/laptops/thinkpad-acpi.txt +++ b/Documentation/laptops/thinkpad-acpi.txt @@ -1,7 +1,7 @@ ThinkPad ACPI Extras Driver - Version 0.19 - January 06th, 2008 + Version 0.20 + April 09th, 2008 Borislav Deianov <borislav@users.sf.net> Henrique de Moraes Holschuh <hmh@hmh.eng.br> @@ -18,6 +18,11 @@ This driver used to be named ibm-acpi until kernel 2.6.21 and release moved to the drivers/misc tree and renamed to thinkpad-acpi for kernel 2.6.22, and release 0.14. +The driver is named "thinkpad-acpi". In some places, like module +names, "thinkpad_acpi" is used because of userspace issues. + +"tpacpi" is used as a shorthand where "thinkpad-acpi" would be too +long due to length limitations on some Linux kernel versions. Status ------ @@ -571,6 +576,47 @@ netlink interface and the input layer interface, and don't bother at all with hotkey_report_mode. +Brightness hotkey notes: + +These are the current sane choices for brightness key mapping in +thinkpad-acpi: + +For IBM and Lenovo models *without* ACPI backlight control (the ones on +which thinkpad-acpi will autoload its backlight interface by default, +and on which ACPI video does not export a backlight interface): + +1. 
Don't enable or map the brightness hotkeys in thinkpad-acpi, as + these older firmware versions unfortunately won't respect the hotkey + mask for brightness keys anyway, and always reacts to them. This + usually work fine, unless X.org drivers are doing something to block + the BIOS. In that case, use (3) below. This is the default mode of + operation. + +2. Enable the hotkeys, but map them to something else that is NOT + KEY_BRIGHTNESS_UP/DOWN or any other keycode that would cause + userspace to try to change the backlight level, and use that as an + on-screen-display hint. + +3. IF AND ONLY IF X.org drivers find a way to block the firmware from + automatically changing the brightness, enable the hotkeys and map + them to KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN, and feed that to + something that calls xbacklight. thinkpad-acpi will not be able to + change brightness in that case either, so you should disable its + backlight interface. + +For Lenovo models *with* ACPI backlight control: + +1. Load up ACPI video and use that. ACPI video will report ACPI + events for brightness change keys. Do not mess with thinkpad-acpi + defaults in this case. thinkpad-acpi should not have anything to do + with backlight events in a scenario where ACPI video is loaded: + brightness hotkeys must be disabled, and the backlight interface is + to be kept disabled as well. This is the default mode of operation. + +2. Do *NOT* load up ACPI video, enable the hotkeys in thinkpad-acpi, + and map them to KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN. Process + these keys on userspace somehow (e.g. by calling xbacklight). + Bluetooth --------- @@ -647,16 +693,31 @@ while others are still having problems. 
For more information: https://bugs.freedesktop.org/show_bug.cgi?id=2000 -ThinkLight control -- /proc/acpi/ibm/light ------------------------------------------- +ThinkLight control +------------------ + +procfs: /proc/acpi/ibm/light +sysfs attributes: as per LED class, for the "tpacpi::thinklight" LED -The current status of the ThinkLight can be found in this file. A few -models which do not make the status available will show it as -"unknown". The available commands are: +procfs notes: + +The ThinkLight status can be read and set through the procfs interface. A +few models which do not make the status available will show the ThinkLight +status as "unknown". The available commands are: echo on > /proc/acpi/ibm/light echo off > /proc/acpi/ibm/light +sysfs notes: + +The ThinkLight sysfs interface is documented by the LED class +documentation, in Documentation/leds-class.txt. The ThinkLight LED name +is "tpacpi::thinklight". + +Due to limitations in the sysfs LED class, if the status of the thinklight +cannot be read or if it is unknown, thinkpad-acpi will report it as "off". +It is impossible to know if the status returned through sysfs is valid. + Docking / undocking -- /proc/acpi/ibm/dock ------------------------------------------ @@ -815,28 +876,63 @@ The cmos command interface is prone to firmware split-brain problems, as in newer ThinkPads it is just a compatibility layer. Do not use it, it is exported just as a debug tool. -LED control -- /proc/acpi/ibm/led ---------------------------------- +LED control +----------- + +procfs: /proc/acpi/ibm/led +sysfs attributes: as per LED class, see below for names + +Some of the LED indicators can be controlled through this feature. On +some older ThinkPad models, it is possible to query the status of the +LED indicators as well. Newer ThinkPads cannot query the real status +of the LED indicators. -Some of the LED indicators can be controlled through this feature. 
The -available commands are: +procfs notes: + +The available commands are: - echo '<led number> on' >/proc/acpi/ibm/led - echo '<led number> off' >/proc/acpi/ibm/led - echo '<led number> blink' >/proc/acpi/ibm/led + echo '<LED number> on' >/proc/acpi/ibm/led + echo '<LED number> off' >/proc/acpi/ibm/led + echo '<LED number> blink' >/proc/acpi/ibm/led -The <led number> range is 0 to 7. The set of LEDs that can be -controlled varies from model to model. Here is the mapping on the X40: +The <LED number> range is 0 to 7. The set of LEDs that can be +controlled varies from model to model. Here is the common ThinkPad +mapping: 0 - power 1 - battery (orange) 2 - battery (green) - 3 - UltraBase + 3 - UltraBase/dock 4 - UltraBay + 5 - UltraBase battery slot + 6 - (unknown) 7 - standby All of the above can be turned on and off and can be made to blink. +sysfs notes: + +The ThinkPad LED sysfs interface is described in detail by the LED class +documentation, in Documentation/leds-class.txt. + +The leds are named (in LED ID order, from 0 to 7): +"tpacpi::power", "tpacpi:orange:batt", "tpacpi:green:batt", +"tpacpi::dock_active", "tpacpi::bay_active", "tpacpi::dock_batt", +"tpacpi::unknown_led", "tpacpi::standby". + +Due to limitations in the sysfs LED class, if the status of the LED +indicators cannot be read due to an error, thinkpad-acpi will report it as +a brightness of zero (same as LED off). + +If the thinkpad firmware doesn't support reading the current status, +trying to read the current LED brightness will just return whatever +brightness was last written to that attribute. + +These LEDs can blink using hardware acceleration. To request that a +ThinkPad indicator LED should blink in hardware accelerated mode, use the +"timer" trigger, and leave the delay_on and delay_off parameters set to +zero (to request hardware acceleration autodetection). 
+ ACPI sounds -- /proc/acpi/ibm/beep ---------------------------------- @@ -1090,6 +1186,15 @@ it there will be the following attributes: dim the display. +WARNING: + + Whatever you do, do NOT ever call thinkpad-acpi backlight-level change + interface and the ACPI-based backlight level change interface + (available on newer BIOSes, and driven by the Linux ACPI video driver) + at the same time. The two will interact in bad ways, do funny things, + and maybe reduce the life of the backlight lamps by needlessly kicking + its level up and down at every change. + Volume control -- /proc/acpi/ibm/volume --------------------------------------- diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 4c1fc65a8b3..3be8ab2a886 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -131,6 +131,9 @@ struct device /* Any queues attached to this device */ struct virtqueue *vq; + /* Handle status being finalized (ie. feature bits stable). */ + void (*ready)(struct device *me); + /* Device-specific data. */ void *priv; }; @@ -925,24 +928,40 @@ static void enable_fd(int fd, struct virtqueue *vq) write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); } -/* When the Guest asks us to reset a device, it's is fairly easy. */ -static void reset_device(struct device *dev) +/* When the Guest tells us they updated the status field, we handle it. */ +static void update_device_status(struct device *dev) { struct virtqueue *vq; - verbose("Resetting device %s\n", dev->name); - /* Clear the status. */ - dev->desc->status = 0; + /* This is a reset. */ + if (dev->desc->status == 0) { + verbose("Resetting device %s\n", dev->name); - /* Clear any features they've acked. */ - memset(get_feature_bits(dev) + dev->desc->feature_len, 0, - dev->desc->feature_len); + /* Clear any features they've acked. */ + memset(get_feature_bits(dev) + dev->desc->feature_len, 0, + dev->desc->feature_len); - /* Zero out the virtqueues. 
*/ - for (vq = dev->vq; vq; vq = vq->next) { - memset(vq->vring.desc, 0, - vring_size(vq->config.num, getpagesize())); - vq->last_avail_idx = 0; + /* Zero out the virtqueues. */ + for (vq = dev->vq; vq; vq = vq->next) { + memset(vq->vring.desc, 0, + vring_size(vq->config.num, getpagesize())); + vq->last_avail_idx = 0; + } + } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { + warnx("Device %s configuration FAILED", dev->name); + } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { + unsigned int i; + + verbose("Device %s OK: offered", dev->name); + for (i = 0; i < dev->desc->feature_len; i++) + verbose(" %08x", get_feature_bits(dev)[i]); + verbose(", accepted"); + for (i = 0; i < dev->desc->feature_len; i++) + verbose(" %08x", get_feature_bits(dev) + [dev->desc->feature_len+i]); + + if (dev->ready) + dev->ready(dev); } } @@ -954,9 +973,9 @@ static void handle_output(int fd, unsigned long addr) /* Check each device and virtqueue. */ for (i = devices.dev; i; i = i->next) { - /* Notifications to device descriptors reset the device. */ + /* Notifications to device descriptors update device status. */ if (from_guest_phys(addr) == i->desc) { - reset_device(i); + update_device_status(i); return; } @@ -1170,6 +1189,7 @@ static struct device *new_device(const char *name, u16 type, int fd, dev->handle_input = handle_input; dev->name = name; dev->vq = NULL; + dev->ready = NULL; /* Append to device list. 
Prepending to a single-linked list is * easier, but the user expects the devices to be arranged on the bus @@ -1398,7 +1418,7 @@ static bool service_io(struct device *dev) struct vblk_info *vblk = dev->priv; unsigned int head, out_num, in_num, wlen; int ret; - struct virtio_blk_inhdr *in; + u8 *in; struct virtio_blk_outhdr *out; struct iovec iov[dev->vq->vring.num]; off64_t off; @@ -1416,7 +1436,7 @@ static bool service_io(struct device *dev) head, out_num, in_num); out = convert(&iov[0], struct virtio_blk_outhdr); - in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); + in = convert(&iov[out_num+in_num-1], u8); off = out->sector * 512; /* The block device implements "barriers", where the Guest indicates @@ -1430,7 +1450,7 @@ static bool service_io(struct device *dev) * It'd be nice if we supported eject, for example, but we don't. */ if (out->type & VIRTIO_BLK_T_SCSI_CMD) { fprintf(stderr, "Scsi commands unsupported\n"); - in->status = VIRTIO_BLK_S_UNSUPP; + *in = VIRTIO_BLK_S_UNSUPP; wlen = sizeof(*in); } else if (out->type & VIRTIO_BLK_T_OUT) { /* Write */ @@ -1453,7 +1473,7 @@ static bool service_io(struct device *dev) errx(1, "Write past end %llu+%u", off, ret); } wlen = sizeof(*in); - in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); + *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); } else { /* Read */ @@ -1466,10 +1486,10 @@ static bool service_io(struct device *dev) verbose("READ from sector %llu: %i\n", out->sector, ret); if (ret >= 0) { wlen = sizeof(*in) + ret; - in->status = VIRTIO_BLK_S_OK; + *in = VIRTIO_BLK_S_OK; } else { wlen = sizeof(*in); - in->status = VIRTIO_BLK_S_IOERR; + *in = VIRTIO_BLK_S_IOERR; } } diff --git a/Documentation/md.txt b/Documentation/md.txt index 396cdd982c2..a8b43062747 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -450,3 +450,9 @@ These currently include there are upper and lower limits (32768, 16). Default is 128. 
stripe_cache_active (currently raid5 only) number of active entries in the stripe cache + preread_bypass_threshold (currently raid5 only) + number of times a stripe requiring preread will be bypassed by + a stripe that does not require preread. For fairness defaults + to 1. Setting this to 0 disables bypass accounting and + requires preread stripes to wait until all full-width stripe- + writes are complete. Valid values are 0 to stripe_cache_size. diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index 7f60dfe642c..b152e81da59 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt @@ -253,6 +253,10 @@ characters, each representing a particular tainted value. 8: 'D' if the kernel has died recently, i.e. there was an OOPS or BUG. + 9: 'A' if the ACPI table has been overridden. + + 10: 'W' if a warning has previously been issued by the kernel. + The primary reason for the 'Tainted: ' string is to tell kernel debuggers if this is a clean kernel or if anything unusual has occurred. Tainting is permanent: even if an offending module is diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt index cf89e8cfd5b..1d2a772506c 100644 --- a/Documentation/powerpc/booting-without-of.txt +++ b/Documentation/powerpc/booting-without-of.txt @@ -2836,6 +2836,39 @@ platforms are moved over to use the flattened-device-tree model. big-endian; }; + r) Freescale Display Interface Unit + + The Freescale DIU is an LCD controller; with proper hardware, it can also + drive DVI monitors. + + Required properties: + - compatible : should be "fsl,diu". + - reg : should contain at least address and length of the DIU register + set. + - interrupts : one DIU interrupt should be described here. 
+ + Example (MPC8610HPCD) + display@2c000 { + compatible = "fsl,diu"; + reg = <0x2c000 100>; + interrupts = <72 2>; + interrupt-parent = <&mpic>; + }; + + s) Freescale on board FPGA + + These are the memory-mapped registers for the on board FPGA. + + Required properties: + - compatible : should be "fsl,fpga-pixis". + - reg : should contain the address and the length of the FPGA register + set. + + Example (MPC8610HPCD) + board-control@e8000000 { + compatible = "fsl,fpga-pixis"; + reg = <0xe8000000 32>; + }; VII - Marvell Discovery mv64[345]6x System Controller chips =========================================================== diff --git a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt index 5e03610e186..cda7a7dffa6 100644 --- a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt +++ b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt @@ -186,6 +186,12 @@ Recommended soc5200 child nodes; populate as needed for your board name device_type compatible Description ---- ----------- ---------- ----------- gpt@<addr> gpt fsl,mpc5200-gpt General purpose timers +gpt@<addr> gpt fsl,mpc5200-gpt-gpio General purpose + timers in GPIO mode +gpio@<addr> fsl,mpc5200-gpio MPC5200 simple gpio + controller +gpio@<addr> fsl,mpc5200-gpio-wkup MPC5200 wakeup gpio + controller rtc@<addr> rtc mpc5200-rtc Real time clock mscan@<addr> mscan mpc5200-mscan CAN bus controller pci@<addr> pci mpc5200-pci PCI bridge @@ -225,6 +231,12 @@ PSC in i2s mode: The mpc5200 and mpc5200b PSCs are not compatible when in i2s mode. An 'mpc5200b-psc-i2s' node cannot include 'mpc5200-psc-i2s' in the compatible field. +7) GPIO controller nodes +Each GPIO controller node should have the empty property gpio-controller and +#gpio-cells set to 2. First cell is the GPIO number which is interpreted +according to the bit numbers in the GPIO control registers. The second cell +is for flags, which are currently unused. 
+ IV - Extra Notes ================ diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt index fd4c32a031c..0bbee38acd2 100644 --- a/Documentation/sound/alsa/ALSA-Configuration.txt +++ b/Documentation/sound/alsa/ALSA-Configuration.txt @@ -795,6 +795,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. lg-lw LG LW20/LW25 laptop tcl TCL S700 clevo Clevo laptops (m520G, m665n) + medion Medion Rim 2150 test for testing/debugging purpose, almost all controls can be adjusted. Appearing only when compiled with $CONFIG_SND_DEBUG=y diff --git a/Documentation/spi/spidev b/Documentation/spi/spidev index 5c8e1b988a0..ed2da5e5b28 100644 --- a/Documentation/spi/spidev +++ b/Documentation/spi/spidev @@ -126,8 +126,8 @@ NOTES: FULL DUPLEX CHARACTER DEVICE API ================================ -See the sample program below for one example showing the use of the full -duplex programming interface. (Although it doesn't perform a full duplex +See the spidev_fdx.c sample program for one example showing the use of the +full duplex programming interface. (Although it doesn't perform a full duplex transfer.) The model is the same as that used in the kernel spi_sync() request; the individual transfers offer the same capabilities as are available to kernel drivers (except that it's not asynchronous). @@ -141,167 +141,3 @@ and bitrate for each transfer segment.) To make a full duplex request, provide both rx_buf and tx_buf for the same transfer. It's even OK if those are the same buffer. 
- - -SAMPLE PROGRAM -============== - --------------------------------- CUT HERE -#include <stdio.h> -#include <unistd.h> -#include <stdlib.h> -#include <fcntl.h> -#include <string.h> - -#include <sys/ioctl.h> -#include <sys/types.h> -#include <sys/stat.h> - -#include <linux/types.h> -#include <linux/spi/spidev.h> - - -static int verbose; - -static void do_read(int fd, int len) -{ - unsigned char buf[32], *bp; - int status; - - /* read at least 2 bytes, no more than 32 */ - if (len < 2) - len = 2; - else if (len > sizeof(buf)) - len = sizeof(buf); - memset(buf, 0, sizeof buf); - - status = read(fd, buf, len); - if (status < 0) { - perror("read"); - return; - } - if (status != len) { - fprintf(stderr, "short read\n"); - return; - } - - printf("read(%2d, %2d): %02x %02x,", len, status, - buf[0], buf[1]); - status -= 2; - bp = buf + 2; - while (status-- > 0) - printf(" %02x", *bp++); - printf("\n"); -} - -static void do_msg(int fd, int len) -{ - struct spi_ioc_transfer xfer[2]; - unsigned char buf[32], *bp; - int status; - - memset(xfer, 0, sizeof xfer); - memset(buf, 0, sizeof buf); - - if (len > sizeof buf) - len = sizeof buf; - - buf[0] = 0xaa; - xfer[0].tx_buf = (__u64) buf; - xfer[0].len = 1; - - xfer[1].rx_buf = (__u64) buf; - xfer[1].len = len; - - status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer); - if (status < 0) { - perror("SPI_IOC_MESSAGE"); - return; - } - - printf("response(%2d, %2d): ", len, status); - for (bp = buf; len; len--) - printf(" %02x", *bp++); - printf("\n"); -} - -static void dumpstat(const char *name, int fd) -{ - __u8 mode, lsb, bits; - __u32 speed; - - if (ioctl(fd, SPI_IOC_RD_MODE, &mode) < 0) { - perror("SPI rd_mode"); - return; - } - if (ioctl(fd, SPI_IOC_RD_LSB_FIRST, &lsb) < 0) { - perror("SPI rd_lsb_fist"); - return; - } - if (ioctl(fd, SPI_IOC_RD_BITS_PER_WORD, &bits) < 0) { - perror("SPI bits_per_word"); - return; - } - if (ioctl(fd, SPI_IOC_RD_MAX_SPEED_HZ, &speed) < 0) { - perror("SPI max_speed_hz"); - return; - } - - printf("%s: spi 
mode %d, %d bits %sper word, %d Hz max\n", - name, mode, bits, lsb ? "(lsb first) " : "", speed); -} - -int main(int argc, char **argv) -{ - int c; - int readcount = 0; - int msglen = 0; - int fd; - const char *name; - - while ((c = getopt(argc, argv, "hm:r:v")) != EOF) { - switch (c) { - case 'm': - msglen = atoi(optarg); - if (msglen < 0) - goto usage; - continue; - case 'r': - readcount = atoi(optarg); - if (readcount < 0) - goto usage; - continue; - case 'v': - verbose++; - continue; - case 'h': - case '?': -usage: - fprintf(stderr, - "usage: %s [-h] [-m N] [-r N] /dev/spidevB.D\n", - argv[0]); - return 1; - } - } - - if ((optind + 1) != argc) - goto usage; - name = argv[optind]; - - fd = open(name, O_RDWR); - if (fd < 0) { - perror("open"); - return 1; - } - - dumpstat(name, fd); - - if (msglen) - do_msg(fd, msglen); - - if (readcount) - do_read(fd, readcount); - - close(fd); - return 0; -} diff --git a/Documentation/spi/spidev_fdx.c b/Documentation/spi/spidev_fdx.c new file mode 100644 index 00000000000..fc354f76038 --- /dev/null +++ b/Documentation/spi/spidev_fdx.c @@ -0,0 +1,158 @@ +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <fcntl.h> +#include <string.h> + +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include <linux/types.h> +#include <linux/spi/spidev.h> + + +static int verbose; + +static void do_read(int fd, int len) +{ + unsigned char buf[32], *bp; + int status; + + /* read at least 2 bytes, no more than 32 */ + if (len < 2) + len = 2; + else if (len > sizeof(buf)) + len = sizeof(buf); + memset(buf, 0, sizeof buf); + + status = read(fd, buf, len); + if (status < 0) { + perror("read"); + return; + } + if (status != len) { + fprintf(stderr, "short read\n"); + return; + } + + printf("read(%2d, %2d): %02x %02x,", len, status, + buf[0], buf[1]); + status -= 2; + bp = buf + 2; + while (status-- > 0) + printf(" %02x", *bp++); + printf("\n"); +} + +static void do_msg(int fd, int len) +{ + struct 
spi_ioc_transfer xfer[2]; + unsigned char buf[32], *bp; + int status; + + memset(xfer, 0, sizeof xfer); + memset(buf, 0, sizeof buf); + + if (len > sizeof buf) + len = sizeof buf; + + buf[0] = 0xaa; + xfer[0].tx_buf = (__u64) buf; + xfer[0].len = 1; + + xfer[1].rx_buf = (__u64) buf; + xfer[1].len = len; + + status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer); + if (status < 0) { + perror("SPI_IOC_MESSAGE"); + return; + } + + printf("response(%2d, %2d): ", len, status); + for (bp = buf; len; len--) + printf(" %02x", *bp++); + printf("\n"); +} + +static void dumpstat(const char *name, int fd) +{ + __u8 mode, lsb, bits; + __u32 speed; + + if (ioctl(fd, SPI_IOC_RD_MODE, &mode) < 0) { + perror("SPI rd_mode"); + return; + } + if (ioctl(fd, SPI_IOC_RD_LSB_FIRST, &lsb) < 0) { + perror("SPI rd_lsb_fist"); + return; + } + if (ioctl(fd, SPI_IOC_RD_BITS_PER_WORD, &bits) < 0) { + perror("SPI bits_per_word"); + return; + } + if (ioctl(fd, SPI_IOC_RD_MAX_SPEED_HZ, &speed) < 0) { + perror("SPI max_speed_hz"); + return; + } + + printf("%s: spi mode %d, %d bits %sper word, %d Hz max\n", + name, mode, bits, lsb ? 
"(lsb first) " : "", speed); +} + +int main(int argc, char **argv) +{ + int c; + int readcount = 0; + int msglen = 0; + int fd; + const char *name; + + while ((c = getopt(argc, argv, "hm:r:v")) != EOF) { + switch (c) { + case 'm': + msglen = atoi(optarg); + if (msglen < 0) + goto usage; + continue; + case 'r': + readcount = atoi(optarg); + if (readcount < 0) + goto usage; + continue; + case 'v': + verbose++; + continue; + case 'h': + case '?': +usage: + fprintf(stderr, + "usage: %s [-h] [-m N] [-r N] /dev/spidevB.D\n", + argv[0]); + return 1; + } + } + + if ((optind + 1) != argc) + goto usage; + name = argv[optind]; + + fd = open(name, O_RDWR); + if (fd < 0) { + perror("open"); + return 1; + } + + dumpstat(name, fd); + + if (msglen) + do_msg(fd, msglen); + + if (readcount) + do_read(fd, readcount); + + close(fd); + return 0; +} diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 10c8f6922ef..5ce0952aa06 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -85,6 +85,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'k' - Secure Access Key (SAK) Kills all programs on the current virtual console. NOTE: See important comments below in SAK section. +'l' - Shows a stack backtrace for all active CPUs. + 'm' - Will dump current memory info to your console. 'n' - Used to make RT tasks nice-able diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt index d9f28be7540..70d68ce8640 100644 --- a/Documentation/thermal/sysfs-api.txt +++ b/Documentation/thermal/sysfs-api.txt @@ -108,10 +108,12 @@ and throttle appropriate devices. RO read only value RW read/write value -All thermal sysfs attributes will be represented under /sys/class/thermal +Thermal sysfs attributes will be represented under /sys/class/thermal. +Hwmon sysfs I/F extension is also available under /sys/class/hwmon +if hwmon is compiled in or built as a module. 
Thermal zone device sys I/F, created once it's registered: -|thermal_zone[0-*]: +/sys/class/thermal/thermal_zone[0-*]: |-----type: Type of the thermal zone |-----temp: Current temperature |-----mode: Working mode of the thermal zone @@ -119,7 +121,7 @@ Thermal zone device sys I/F, created once it's registered: |-----trip_point_[0-*]_type: Trip point type Thermal cooling device sys I/F, created once it's registered: -|cooling_device[0-*]: +/sys/class/thermal/cooling_device[0-*]: |-----type : Type of the cooling device(processor/fan/...) |-----max_state: Maximum cooling state of the cooling device |-----cur_state: Current cooling state of the cooling device @@ -130,10 +132,19 @@ They represent the relationship between a thermal zone and its associated coolin They are created/removed for each thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device successful execution. -|thermal_zone[0-*] +/sys/class/thermal/thermal_zone[0-*] |-----cdev[0-*]: The [0-*]th cooling device in the current thermal zone |-----cdev[0-*]_trip_point: Trip point that cdev[0-*] is associated with +Besides the thermal zone device sysfs I/F and cooling device sysfs I/F, +the generic thermal driver also creates a hwmon sysfs I/F for each _type_ of +thermal zone device. E.g. the generic thermal driver registers one hwmon class device +and builds the associated hwmon sysfs I/F for all the registered ACPI thermal zones. +/sys/class/hwmon/hwmon[0-*]: + |-----name: The type of the thermal zone devices. + |-----temp[1-*]_input: The current temperature of thermal zone [1-*]. + |-----temp[1-*]_crit: The critical trip point of thermal zone [1-*]. +Please read Documentation/hwmon/sysfs-interface for additional information. *************************** * Thermal zone attributes * @@ -141,7 +152,10 @@ thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device successful e type Strings which represent the thermal zone type. This is given by thermal zone driver as part of registration. 
- Eg: "ACPI thermal zone" indicates it's a ACPI thermal device + Eg: "acpitz" indicates it's an ACPI thermal device. + In order to keep it consistent with hwmon sys attribute, + this should be a short, lowercase string, + not containing spaces nor dashes. RO Required @@ -218,7 +232,7 @@ the sys I/F structure will be built like this: /sys/class/thermal: |thermal_zone1: - |-----type: ACPI thermal zone + |-----type: acpitz |-----temp: 37000 |-----mode: kernel |-----trip_point_0_temp: 100000 @@ -243,3 +257,10 @@ the sys I/F structure will be built like this: |-----type: Fan |-----max_state: 2 |-----cur_state: 0 + +/sys/class/hwmon: + +|hwmon0: + |-----name: acpitz + |-----temp1_input: 37000 + |-----temp1_crit: 100000 diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134 index 44d84dd15ad..67937df1e97 100644 --- a/Documentation/video4linux/CARDLIST.saa7134 +++ b/Documentation/video4linux/CARDLIST.saa7134 @@ -128,7 +128,7 @@ 127 -> Beholder BeholdTV 507 FM/RDS / BeholdTV 509 FM [0000:5071,0000:507B,5ace:5070,5ace:5090] 128 -> Beholder BeholdTV Columbus TVFM [0000:5201] 129 -> Beholder BeholdTV 607 / BeholdTV 609 [5ace:6070,5ace:6071,5ace:6072,5ace:6073,5ace:6090,5ace:6091,5ace:6092,5ace:6093] -130 -> Beholder BeholdTV M6 / BeholdTV M6 Extra [5ace:6190,5ace:6193] +130 -> Beholder BeholdTV M6 / BeholdTV M6 Extra [5ace:6190,5ace:6193,5ace:6191] 131 -> Twinhan Hybrid DTV-DVB 3056 PCI [1822:0022] 132 -> Genius TVGO AM11MCE 133 -> NXP Snake DVB-S reference design @@ -140,3 +140,4 @@ 139 -> Compro VideoMate T750 [185b:c900] 140 -> Avermedia DVB-S Pro A700 [1461:a7a1] 141 -> Avermedia DVB-S Hybrid+FM A700 [1461:a7a2] +142 -> Beholder BeholdTV H6 [5ace:6290] diff --git a/Documentation/video4linux/cx18.txt b/Documentation/video4linux/cx18.txt new file mode 100644 index 00000000000..077d56ec3f3 --- /dev/null +++ b/Documentation/video4linux/cx18.txt @@ -0,0 +1,34 @@ +Some notes regarding the cx18 driver for the Conexant CX23418 MPEG 
+encoder chip: + +1) The only hardware currently supported is the Hauppauge HVR-1600. + +2) Some people have problems getting the i2c bus to work. Cause unknown. + The symptom is that the eeprom cannot be read and the card is + unusable. + +3) The audio from the analog tuner is mono only. Probably caused by + incorrect audio register information in the datasheet. We are + waiting for updated information from Conexant. + +4) VBI (raw or sliced) has not yet been implemented. + +5) MPEG indexing is not yet implemented. + +6) The driver is still a bit rough around the edges, this should + improve over time. + + +Firmware: + +The firmware needs to be extracted from the Windows Hauppauge HVR-1600 +driver, available here: + +http://hauppauge.lightpath.net/software/install_cd/hauppauge_cd_3.4d1.zip + +Unzip, then copy the following files to the firmware directory +and rename them as follows: + +Drivers/Driver18/hcw18apu.rom -> v4l-cx23418-apu.fw +Drivers/Driver18/hcw18enc.rom -> v4l-cx23418-cpu.fw +Drivers/Driver18/hcw18mlC.rom -> v4l-cx23418-dig.fw diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt index dd498649799..bad16d3f6a4 100644 --- a/Documentation/vm/numa_memory_policy.txt +++ b/Documentation/vm/numa_memory_policy.txt @@ -135,77 +135,58 @@ most general to most specific: Components of Memory Policies - A Linux memory policy is a tuple consisting of a "mode" and an optional set - of nodes. The mode determine the behavior of the policy, while the - optional set of nodes can be viewed as the arguments to the behavior. + A Linux memory policy consists of a "mode", optional mode flags, and an + optional set of nodes. The mode determines the behavior of the policy, + the optional mode flags determine the behavior of the mode, and the + optional set of nodes can be viewed as the arguments to the policy + behavior. Internally, memory policies are implemented by a reference counted structure, struct mempolicy. 
Details of this structure will be discussed in context, below, as required to explain the behavior. - Note: in some functions AND in the struct mempolicy itself, the mode - is called "policy". However, to avoid confusion with the policy tuple, - this document will continue to use the term "mode". - Linux memory policy supports the following 4 behavioral modes: - Default Mode--MPOL_DEFAULT: The behavior specified by this mode is - context or scope dependent. - - As mentioned in the Policy Scope section above, during normal - system operation, the System Default Policy is hard coded to - contain the Default mode. - - In this context, default mode means "local" allocation--that is - attempt to allocate the page from the node associated with the cpu - where the fault occurs. If the "local" node has no memory, or the - node's memory can be exhausted [no free pages available], local - allocation will "fallback to"--attempt to allocate pages from-- - "nearby" nodes, in order of increasing "distance". + Default Mode--MPOL_DEFAULT: This mode is only used in the memory + policy APIs. Internally, MPOL_DEFAULT is converted to the NULL + memory policy in all policy scopes. Any existing non-default policy + will simply be removed when MPOL_DEFAULT is specified. As a result, + MPOL_DEFAULT means "fall back to the next most specific policy scope." - Implementation detail -- subject to change: "Fallback" uses - a per node list of sibling nodes--called zonelists--built at - boot time, or when nodes or memory are added or removed from - the system [memory hotplug]. These per node zonelist are - constructed with nodes in order of increasing distance based - on information provided by the platform firmware. + For example, a NULL or default task policy will fall back to the + system default policy. A NULL or default vma policy will fall + back to the task policy. 
- When a task/process policy or a shared policy contains the Default - mode, this also means "local allocation", as described above. + When specified in one of the memory policy APIs, the Default mode + does not use the optional set of nodes. - In the context of a VMA, Default mode means "fall back to task - policy"--which may or may not specify Default mode. Thus, Default - mode can not be counted on to mean local allocation when used - on a non-shared region of the address space. However, see - MPOL_PREFERRED below. - - The Default mode does not use the optional set of nodes. + It is an error for the set of nodes specified for this policy to + be non-empty. MPOL_BIND: This mode specifies that memory must come from the - set of nodes specified by the policy. - - The memory policy APIs do not specify an order in which the nodes - will be searched. However, unlike "local allocation", the Bind - policy does not consider the distance between the nodes. Rather, - allocations will fallback to the nodes specified by the policy in - order of numeric node id. Like everything in Linux, this is subject - to change. + set of nodes specified by the policy. Memory will be allocated from + the node in the set with sufficient free memory that is closest to + the node where the allocation takes place. MPOL_PREFERRED: This mode specifies that the allocation should be attempted from the single node specified in the policy. If that - allocation fails, the kernel will search other nodes, exactly as - it would for a local allocation that started at the preferred node - in increasing distance from the preferred node. "Local" allocation - policy can be viewed as a Preferred policy that starts at the node + allocation fails, the kernel will search other nodes, in order of + increasing distance from the preferred node based on information + provided by the platform firmware. containing the cpu where the allocation takes place. 
Internally, the Preferred policy uses a single node--the - preferred_node member of struct mempolicy. A "distinguished - value of this preferred_node, currently '-1', is interpreted - as "the node containing the cpu where the allocation takes - place"--local allocation. This is the way to specify - local allocation for a specific range of addresses--i.e. for - VMA policies. + preferred_node member of struct mempolicy. When the internal + mode flag MPOL_F_LOCAL is set, the preferred_node is ignored and + the policy is interpreted as local allocation. "Local" allocation + policy can be viewed as a Preferred policy that starts at the node + containing the cpu where the allocation takes place. + + It is possible for the user to specify that local allocation is + always preferred by passing an empty nodemask with this mode. + If an empty nodemask is passed, the policy cannot use the + MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags described + below. MPOL_INTERLEAVED: This mode specifies that page allocations be interleaved, on a page granularity, across the nodes specified in @@ -231,6 +212,154 @@ Components of Memory Policies the temporary interleaved system default policy works in this mode. + Linux memory policy supports the following optional mode flags: + + MPOL_F_STATIC_NODES: This flag specifies that the nodemask passed by + the user should not be remapped if the task or VMA's set of allowed + nodes changes after the memory policy has been defined. + + Without this flag, anytime a mempolicy is rebound because of a + change in the set of allowed nodes, the node (Preferred) or + nodemask (Bind, Interleave) is remapped to the new set of + allowed nodes. This may result in nodes being used that were + previously undesired. + + With this flag, if the user-specified nodes overlap with the + nodes allowed by the task's cpuset, then the memory policy is + applied to their intersection. If the two sets of nodes do not + overlap, the Default policy is used. 
+ + For example, consider a task that is attached to a cpuset with + mems 1-3 that sets an Interleave policy over the same set. If + the cpuset's mems change to 3-5, the Interleave will now occur + over nodes 3, 4, and 5. With this flag, however, since only node + 3 is allowed from the user's nodemask, the "interleave" only + occurs over that node. If no nodes from the user's nodemask are + now allowed, the Default behavior is used. + + MPOL_F_STATIC_NODES cannot be combined with the + MPOL_F_RELATIVE_NODES flag. It also cannot be used for + MPOL_PREFERRED policies that were created with an empty nodemask + (local allocation). + + MPOL_F_RELATIVE_NODES: This flag specifies that the nodemask passed + by the user will be mapped relative to the task or VMA's + set of allowed nodes. The kernel stores the user-passed nodemask, + and if the set of allowed nodes changes, then that original nodemask will + be remapped relative to the new set of allowed nodes. + + Without this flag (and without MPOL_F_STATIC_NODES), anytime a + mempolicy is rebound because of a change in the set of allowed + nodes, the node (Preferred) or nodemask (Bind, Interleave) is + remapped to the new set of allowed nodes. That remap may not + preserve the relative nature of the user's passed nodemask to its + set of allowed nodes upon successive rebinds: a nodemask of + 1,3,5 may be remapped to 7-9 and then to 1-3 if the set of + allowed nodes is restored to its original state. + + With this flag, the remap is done so that the node numbers from + the user's passed nodemask are relative to the set of allowed + nodes. In other words, if nodes 0, 2, and 4 are set in the user's + nodemask, the policy will be effected over the first (and in the + Bind or Interleave case, the third and fifth) nodes in the set of + allowed nodes. The nodemask passed by the user represents nodes + relative to the task or VMA's set of allowed nodes.
+ + If the user's nodemask includes nodes that are outside the range + of the new set of allowed nodes (for example, node 5 is set in + the user's nodemask when the set of allowed nodes is only 0-3), + then the remap wraps around to the beginning of the nodemask and, + if not already set, sets the node in the mempolicy nodemask. + + For example, consider a task that is attached to a cpuset with + mems 2-5 that sets an Interleave policy over the same set with + MPOL_F_RELATIVE_NODES. If the cpuset's mems change to 3-7, the + interleave now occurs over nodes 3,5-7. If the cpuset's mems + then change to 0,2-3,5, then the interleave occurs over nodes + 0,2-3,5. + + Thanks to the consistent remapping, applications preparing + nodemasks to specify memory policies using this flag should + disregard their current, actual cpuset imposed memory placement + and prepare the nodemask as if they were always located on + memory nodes 0 to N-1, where N is the number of memory nodes the + policy is intended to manage. Let the kernel then remap to the + set of memory nodes allowed by the task's cpuset, as that may + change over time. + + MPOL_F_RELATIVE_NODES cannot be combined with the + MPOL_F_STATIC_NODES flag. It also cannot be used for + MPOL_PREFERRED policies that were created with an empty nodemask + (local allocation). + +MEMORY POLICY REFERENCE COUNTING + +To resolve use/free races, struct mempolicy contains an atomic reference +count field. Internal interfaces, mpol_get()/mpol_put() increment and +decrement this reference count, respectively. mpol_put() will only free +the structure back to the mempolicy kmem cache when the reference count +goes to zero. + +When a new memory policy is allocated, its reference count is initialized +to '1', representing the reference held by the task that is installing the +new policy.
When a pointer to a memory policy structure is stored in another +structure, another reference is added, as the task's reference will be dropped +on completion of the policy installation. + +During run-time "usage" of the policy, we attempt to minimize atomic operations +on the reference count, as this can lead to cache lines bouncing between cpus +and NUMA nodes. "Usage" here means one of the following: + +1) querying of the policy, either by the task itself [using the get_mempolicy() + API discussed below] or by another task using the /proc/<pid>/numa_maps + interface. + +2) examination of the policy to determine the policy mode and associated node + or node lists, if any, for page allocation. This is considered a "hot + path". Note that for MPOL_BIND, the "usage" extends across the entire + allocation process, which may sleep during page reclamation, because the + BIND policy nodemask is used, by reference, to filter ineligible nodes. + +We can avoid taking an extra reference during the usages listed above as +follows: + +1) we never need to get/free the system default policy as this is never + changed nor freed, once the system is up and running. + +2) for querying the policy, we do not need to take an extra reference on the + target task's task policy nor vma policies because we always acquire the + task's mm's mmap_sem for read during the query. The set_mempolicy() and + mbind() APIs [see below] always acquire the mmap_sem for write when + installing or replacing task or vma policies. Thus, there is no possibility + of a task or thread freeing a policy while another task or thread is + querying it. + +3) Page allocation usage of task or vma policy occurs in the fault path where + we hold the mmap_sem for read. Again, because replacing the task or vma + policy requires that the mmap_sem be held for write, the policy can't be + freed out from under us while we're using it for page allocation. + +4) Shared policies require special consideration.
One task can replace a + shared memory policy while another task, with a distinct mmap_sem, is + querying or allocating a page based on the policy. To resolve this + potential race, the shared policy infrastructure adds an extra reference + to the shared policy during lookup while holding a spin lock on the shared + policy management structure. This requires that we drop this extra + reference when we're finished "using" the policy. We must drop the + extra reference on shared policies in the same query/allocation paths + used for non-shared policies. For this reason, shared policies are marked + as such, and the extra reference is dropped "conditionally"--i.e., only + for shared policies. + + Because of this extra reference counting, and because we must look up + shared policies in a tree structure under spinlock, shared policies are + more expensive to use in the page allocation path. This is especially + true for shared policies on shared memory regions shared by tasks running + on different NUMA nodes. This extra overhead can be avoided by always + falling back to task or system default policy for shared memory regions, + or by prefaulting the entire shared memory region into memory and locking + it down. However, this might not be appropriate for all applications. + MEMORY POLICY APIs Linux supports 3 system calls for controlling memory policy. These APIS @@ -251,7 +380,9 @@ Set [Task] Memory Policy: Set's the calling task's "task/process memory policy" to mode specified by the 'mode' argument and the set of nodes defined by 'nmask'. 'nmask' points to a bit mask of node ids containing - at least 'maxnode' ids. + at least 'maxnode' ids. Optional mode flags may be passed by + combining the 'mode' argument with the flag (for example: + MPOL_INTERLEAVE | MPOL_F_STATIC_NODES). See the set_mempolicy(2) man page for more details @@ -303,29 +434,19 @@ MEMORY POLICIES AND CPUSETS Memory policies work within cpusets as described above.
For memory policies that require a node or set of nodes, the nodes are restricted to the set of nodes whose memories are allowed by the cpuset constraints. If the nodemask -specified for the policy contains nodes that are not allowed by the cpuset, or -the intersection of the set of nodes specified for the policy and the set of -nodes with memory is the empty set, the policy is considered invalid -and cannot be installed. - -The interaction of memory policies and cpusets can be problematic for a -couple of reasons: - -1) the memory policy APIs take physical node id's as arguments. As mentioned - above, it is illegal to specify nodes that are not allowed in the cpuset. - The application must query the allowed nodes using the get_mempolicy() - API with the MPOL_F_MEMS_ALLOWED flag to determine the allowed nodes and - restrict itself to those nodes. However, the resources available to a - cpuset can be changed by the system administrator, or a workload manager - application, at any time. So, a task may still get errors attempting to - specify policy nodes, and must query the allowed memories again. - -2) when tasks in two cpusets share access to a memory region, such as shared - memory segments created by shmget() of mmap() with the MAP_ANONYMOUS and - MAP_SHARED flags, and any of the tasks install shared policy on the region, - only nodes whose memories are allowed in both cpusets may be used in the - policies. Obtaining this information requires "stepping outside" the - memory policy APIs to use the cpuset information and requires that one - know in what cpusets other task might be attaching to the shared region. - Furthermore, if the cpusets' allowed memory sets are disjoint, "local" - allocation is the only valid policy. +specified for the policy contains nodes that are not allowed by the cpuset and +MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes +specified for the policy and the set of nodes with memory is used. 
If the +result is the empty set, the policy is considered invalid and cannot be +installed. If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped +onto and folded into the task's set of allowed nodes as previously described. + +The interaction of memory policies and cpusets can be problematic when tasks +in two cpusets share access to a memory region, such as shared memory segments +created by shmget() or mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and +any of the tasks install shared policy on the region: only nodes whose +memories are allowed in both cpusets may be used in the policies. Obtaining +this information requires "stepping outside" the memory policy APIs to use the +cpuset information and requires that one know in what cpusets other tasks might +be attaching to the shared region. Furthermore, if the cpusets' allowed +memory sets are disjoint, "local" allocation is the only valid policy. diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c index 22d7e3e4d60..d3ce295bffa 100644 --- a/Documentation/vm/slabinfo.c +++ b/Documentation/vm/slabinfo.c @@ -31,7 +31,7 @@ struct slabinfo { int hwcache_align, object_size, objs_per_slab; int sanity_checks, slab_size, store_user, trace; int order, poison, reclaim_account, red_zone; - unsigned long partial, objects, slabs; + unsigned long partial, objects, slabs, objects_partial, objects_total; unsigned long alloc_fastpath, alloc_slowpath; unsigned long free_fastpath, free_slowpath; unsigned long free_frozen, free_add_partial, free_remove_partial; @@ -540,7 +540,8 @@ void slabcache(struct slabinfo *s) return; store_size(size_str, slab_size(s)); - snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs); + snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs, + s->partial, s->cpu_slabs); if (!line++) first_line(); @@ -776,7 +777,6 @@ void totals(void) unsigned long used; unsigned long long wasted; unsigned long long objwaste; - long long
objects_in_partial_slabs; unsigned long percentage_partial_slabs; unsigned long percentage_partial_objs; @@ -790,18 +790,11 @@ void totals(void) wasted = size - used; objwaste = s->slab_size - s->object_size; - objects_in_partial_slabs = s->objects - - (s->slabs - s->partial - s ->cpu_slabs) * - s->objs_per_slab; - - if (objects_in_partial_slabs < 0) - objects_in_partial_slabs = 0; - percentage_partial_slabs = s->partial * 100 / s->slabs; if (percentage_partial_slabs > 100) percentage_partial_slabs = 100; - percentage_partial_objs = objects_in_partial_slabs * 100 + percentage_partial_objs = s->objects_partial * 100 / s->objects; if (percentage_partial_objs > 100) @@ -823,8 +816,8 @@ void totals(void) min_objects = s->objects; if (used < min_used) min_used = used; - if (objects_in_partial_slabs < min_partobj) - min_partobj = objects_in_partial_slabs; + if (s->objects_partial < min_partobj) + min_partobj = s->objects_partial; if (percentage_partial_slabs < min_ppart) min_ppart = percentage_partial_slabs; if (percentage_partial_objs < min_ppartobj) @@ -848,8 +841,8 @@ void totals(void) max_objects = s->objects; if (used > max_used) max_used = used; - if (objects_in_partial_slabs > max_partobj) - max_partobj = objects_in_partial_slabs; + if (s->objects_partial > max_partobj) + max_partobj = s->objects_partial; if (percentage_partial_slabs > max_ppart) max_ppart = percentage_partial_slabs; if (percentage_partial_objs > max_ppartobj) @@ -864,7 +857,7 @@ void totals(void) total_objects += s->objects; total_used += used; - total_partobj += objects_in_partial_slabs; + total_partobj += s->objects_partial; total_ppart += percentage_partial_slabs; total_ppartobj += percentage_partial_objs; @@ -1160,6 +1153,8 @@ void read_slab_dir(void) slab->hwcache_align = get_obj("hwcache_align"); slab->object_size = get_obj("object_size"); slab->objects = get_obj("objects"); + slab->objects_partial = get_obj("objects_partial"); + slab->objects_total = get_obj("objects_total"); 
slab->objs_per_slab = get_obj("objs_per_slab"); slab->order = get_obj("order"); slab->partial = get_obj("partial"); |