aboutsummaryrefslogtreecommitdiff
path: root/drivers/infiniband/core
diff options
context:
space:
mode:
author Roland Dreier <rolandd@cisco.com> 2007-03-04 16:15:11 -0800
committer Roland Dreier <rolandd@cisco.com> 2007-05-08 18:00:37 -0700
commit f7c6a7b5d59980b076abbf2ceeb8735591290285 (patch)
tree 29c35b47052bba87f031a4744d8ad12ff5187149 /drivers/infiniband/core
parent 36f021b579d195cdc5fa6f3e2bab198b4bf70643 (diff)
IB/uverbs: Export ib_umem_get()/ib_umem_release() to modules
Export ib_umem_get()/ib_umem_release() and put low-level drivers in control of when to call ib_umem_get() to pin and DMA map userspace memory, rather than always calling it in ib_uverbs_reg_mr() before calling the low-level driver's reg_user_mr method.

Also move these functions to be in the ib_core module instead of ib_uverbs, so that driver modules using them do not depend on ib_uverbs.

This has a number of advantages:

- It is better design from the standpoint of making generic code a library that can be used or overridden by device-specific code as the details of specific devices dictate.

- Drivers that do not need to pin userspace memory regions do not need to take the performance hit of calling ib_umem_get(). For example, although I have not tried to implement it in this patch, the ipath driver should be able to avoid pinning memory and just use copy_{to,from}_user() to access userspace memory regions.

- Buffers that need special mapping treatment can be identified by the low-level driver. For example, it may be possible to solve some Altix-specific memory ordering issues with mthca CQs in userspace by mapping CQ buffers with extra flags.

- Drivers that need to pin and DMA map userspace memory for things other than memory regions can use ib_umem_get() directly, instead of hacks using extra parameters to their reg_phys_mr method. For example, the mlx4 driver that is pending being merged needs to pin and DMA map QP and CQ buffers, but it does not need to create a memory key for these buffers. So the cleanest solution is for mlx4 to call ib_umem_get() in the create_qp and create_cq methods.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r-- drivers/infiniband/core/Makefile | 4
-rw-r--r-- drivers/infiniband/core/device.c | 2
-rw-r--r-- drivers/infiniband/core/umem.c (renamed from drivers/infiniband/core/uverbs_mem.c) | 136
-rw-r--r-- drivers/infiniband/core/uverbs.h | 6
-rw-r--r-- drivers/infiniband/core/uverbs_cmd.c | 60
-rw-r--r-- drivers/infiniband/core/uverbs_main.c | 11
6 files changed, 120 insertions, 99 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 189e5d4b9b1..cb1ab3ea499 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
device.o fmr_pool.o cache.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
@@ -28,5 +29,4 @@ ib_umad-y := user_mad.o
ib_ucm-y := ucm.o
-ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_mem.o \
- uverbs_marshall.o
+ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 7fabb425b03..592c90aa318 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -613,6 +613,8 @@ static void __exit ib_core_cleanup(void)
{
ib_cache_cleanup();
ib_sysfs_cleanup();
+ /* Make sure that any pending umem accounting work is done. */
+ flush_scheduled_work();
}
module_init(ib_core_init);
diff --git a/drivers/infiniband/core/uverbs_mem.c b/drivers/infiniband/core/umem.c
index c95fe952abd..48e854cf416 100644
--- a/drivers/infiniband/core/uverbs_mem.c
+++ b/drivers/infiniband/core/umem.c
@@ -64,35 +64,56 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
}
}
-int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
- void *addr, size_t size, int write)
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ */
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+ size_t size, int access)
{
+ struct ib_umem *umem;
struct page **page_list;
struct ib_umem_chunk *chunk;
unsigned long locked;
unsigned long lock_limit;
unsigned long cur_base;
unsigned long npages;
- int ret = 0;
+ int ret;
int off;
int i;
if (!can_do_mlock())
- return -EPERM;
+ return ERR_PTR(-EPERM);
- page_list = (struct page **) __get_free_page(GFP_KERNEL);
- if (!page_list)
- return -ENOMEM;
+ umem = kmalloc(sizeof *umem, GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
- mem->user_base = (unsigned long) addr;
- mem->length = size;
- mem->offset = (unsigned long) addr & ~PAGE_MASK;
- mem->page_size = PAGE_SIZE;
- mem->writable = write;
+ umem->context = context;
+ umem->length = size;
+ umem->offset = addr & ~PAGE_MASK;
+ umem->page_size = PAGE_SIZE;
+ /*
+ * We ask for writable memory if any access flags other than
+ * "remote read" are set. "Local write" and "remote write"
+ * obviously require write access. "Remote atomic" can do
+ * things like fetch and add, which will modify memory, and
+ * "MW bind" can change permissions by binding a window.
+ */
+ umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
- INIT_LIST_HEAD(&mem->chunk_list);
+ INIT_LIST_HEAD(&umem->chunk_list);
+
+ page_list = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!page_list) {
+ kfree(umem);
+ return ERR_PTR(-ENOMEM);
+ }
- npages = PAGE_ALIGN(size + mem->offset) >> PAGE_SHIFT;
+ npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
down_write(&current->mm->mmap_sem);
@@ -104,13 +125,13 @@ int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
goto out;
}
- cur_base = (unsigned long) addr & PAGE_MASK;
+ cur_base = addr & PAGE_MASK;
while (npages) {
ret = get_user_pages(current, current->mm, cur_base,
min_t(int, npages,
PAGE_SIZE / sizeof (struct page *)),
- 1, !write, page_list, NULL);
+ 1, !umem->writable, page_list, NULL);
if (ret < 0)
goto out;
@@ -136,7 +157,7 @@ int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
chunk->page_list[i].length = PAGE_SIZE;
}
- chunk->nmap = ib_dma_map_sg(dev,
+ chunk->nmap = ib_dma_map_sg(context->device,
&chunk->page_list[0],
chunk->nents,
DMA_BIDIRECTIONAL);
@@ -151,33 +172,25 @@ int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
ret -= chunk->nents;
off += chunk->nents;
- list_add_tail(&chunk->list, &mem->chunk_list);
+ list_add_tail(&chunk->list, &umem->chunk_list);
}
ret = 0;
}
out:
- if (ret < 0)
- __ib_umem_release(dev, mem, 0);
- else
+ if (ret < 0) {
+ __ib_umem_release(context->device, umem, 0);
+ kfree(umem);
+ } else
current->mm->locked_vm = locked;
up_write(&current->mm->mmap_sem);
free_page((unsigned long) page_list);
- return ret;
-}
-
-void ib_umem_release(struct ib_device *dev, struct ib_umem *umem)
-{
- __ib_umem_release(dev, umem, 1);
-
- down_write(&current->mm->mmap_sem);
- current->mm->locked_vm -=
- PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
- up_write(&current->mm->mmap_sem);
+ return ret < 0 ? ERR_PTR(ret) : umem;
}
+EXPORT_SYMBOL(ib_umem_get);
static void ib_umem_account(struct work_struct *_work)
{
@@ -191,35 +204,70 @@ static void ib_umem_account(struct work_struct *_work)
kfree(work);
}
-void ib_umem_release_on_close(struct ib_device *dev, struct ib_umem *umem)
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ */
+void ib_umem_release(struct ib_umem *umem)
{
struct ib_umem_account_work *work;
+ struct ib_ucontext *context = umem->context;
struct mm_struct *mm;
+ unsigned long diff;
- __ib_umem_release(dev, umem, 1);
+ __ib_umem_release(umem->context->device, umem, 1);
mm = get_task_mm(current);
if (!mm)
return;
+ diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+ kfree(umem);
+
/*
* We may be called with the mm's mmap_sem already held. This
* can happen when a userspace munmap() is the call that drops
* the last reference to our file and calls our release
* method. If there are memory regions to destroy, we'll end
- * up here and not be able to take the mmap_sem. Therefore we
- * defer the vm_locked accounting to the system workqueue.
+ * up here and not be able to take the mmap_sem. In that case
+ * we defer the vm_locked accounting to the system workqueue.
*/
+ if (context->closing && !down_write_trylock(&mm->mmap_sem)) {
+ work = kmalloc(sizeof *work, GFP_KERNEL);
+ if (!work) {
+ mmput(mm);
+ return;
+ }
- work = kmalloc(sizeof *work, GFP_KERNEL);
- if (!work) {
- mmput(mm);
+ INIT_WORK(&work->work, ib_umem_account);
+ work->mm = mm;
+ work->diff = diff;
+
+ schedule_work(&work->work);
return;
- }
+ } else
+ down_write(&mm->mmap_sem);
+
+ current->mm->locked_vm -= diff;
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+}
+EXPORT_SYMBOL(ib_umem_release);
+
+int ib_umem_page_count(struct ib_umem *umem)
+{
+ struct ib_umem_chunk *chunk;
+ int shift;
+ int i;
+ int n;
+
+ shift = ilog2(umem->page_size);
- INIT_WORK(&work->work, ib_umem_account);
- work->mm = mm;
- work->diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+ n = 0;
+ list_for_each_entry(chunk, &umem->chunk_list, list)
+ for (i = 0; i < chunk->nmap; ++i)
+ n += sg_dma_len(&chunk->page_list[i]) >> shift;
- schedule_work(&work->work);
+ return n;
}
+EXPORT_SYMBOL(ib_umem_page_count);
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 102a59c033f..c33546f9e96 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -45,6 +45,7 @@
#include <linux/completion.h>
#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
/*
@@ -163,11 +164,6 @@ void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_event_handler(struct ib_event_handler *handler,
struct ib_event *event);
-int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
- void *addr, size_t size, int write);
-void ib_umem_release(struct ib_device *dev, struct ib_umem *umem);
-void ib_umem_release_on_close(struct ib_device *dev, struct ib_umem *umem);
-
#define IB_UVERBS_DECLARE_CMD(name) \
ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
const char __user *buf, int in_len, \
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index bab66769be1..01d70084aeb 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1,6 +1,6 @@
/*
* Copyright (c) 2005 Topspin Communications. All rights reserved.
- * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
* Copyright (c) 2006 Mellanox Technologies. All rights reserved.
*
@@ -295,6 +295,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&ucontext->qp_list);
INIT_LIST_HEAD(&ucontext->srq_list);
INIT_LIST_HEAD(&ucontext->ah_list);
+ ucontext->closing = 0;
resp.num_comp_vectors = file->device->num_comp_vectors;
@@ -573,7 +574,7 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
struct ib_uverbs_reg_mr cmd;
struct ib_uverbs_reg_mr_resp resp;
struct ib_udata udata;
- struct ib_umem_object *obj;
+ struct ib_uobject *uobj;
struct ib_pd *pd;
struct ib_mr *mr;
int ret;
@@ -599,35 +600,21 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
!(cmd.access_flags & IB_ACCESS_LOCAL_WRITE))
return -EINVAL;
- obj = kmalloc(sizeof *obj, GFP_KERNEL);
- if (!obj)
+ uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+ if (!uobj)
return -ENOMEM;
- init_uobj(&obj->uobject, 0, file->ucontext, &mr_lock_key);
- down_write(&obj->uobject.mutex);
-
- /*
- * We ask for writable memory if any access flags other than
- * "remote read" are set. "Local write" and "remote write"
- * obviously require write access. "Remote atomic" can do
- * things like fetch and add, which will modify memory, and
- * "MW bind" can change permissions by binding a window.
- */
- ret = ib_umem_get(file->device->ib_dev, &obj->umem,
- (void *) (unsigned long) cmd.start, cmd.length,
- !!(cmd.access_flags & ~IB_ACCESS_REMOTE_READ));
- if (ret)
- goto err_free;
-
- obj->umem.virt_base = cmd.hca_va;
+ init_uobj(uobj, 0, file->ucontext, &mr_lock_key);
+ down_write(&uobj->mutex);
pd = idr_read_pd(cmd.pd_handle, file->ucontext);
if (!pd) {
ret = -EINVAL;
- goto err_release;
+ goto err_free;
}
- mr = pd->device->reg_user_mr(pd, &obj->umem, cmd.access_flags, &udata);
+ mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+ cmd.access_flags, &udata);
if (IS_ERR(mr)) {
ret = PTR_ERR(mr);
goto err_put;
@@ -635,19 +622,19 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
mr->device = pd->device;
mr->pd = pd;
- mr->uobject = &obj->uobject;
+ mr->uobject = uobj;
atomic_inc(&pd->usecnt);
atomic_set(&mr->usecnt, 0);
- obj->uobject.object = mr;
- ret = idr_add_uobj(&ib_uverbs_mr_idr, &obj->uobject);
+ uobj->object = mr;
+ ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
if (ret)
goto err_unreg;
memset(&resp, 0, sizeof resp);
resp.lkey = mr->lkey;
resp.rkey = mr->rkey;
- resp.mr_handle = obj->uobject.id;
+ resp.mr_handle = uobj->id;
if (copy_to_user((void __user *) (unsigned long) cmd.response,
&resp, sizeof resp)) {
@@ -658,17 +645,17 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
put_pd_read(pd);
mutex_lock(&file->mutex);
- list_add_tail(&obj->uobject.list, &file->ucontext->mr_list);
+ list_add_tail(&uobj->list, &file->ucontext->mr_list);
mutex_unlock(&file->mutex);
- obj->uobject.live = 1;
+ uobj->live = 1;
- up_write(&obj->uobject.mutex);
+ up_write(&uobj->mutex);
return in_len;
err_copy:
- idr_remove_uobj(&ib_uverbs_mr_idr, &obj->uobject);
+ idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
err_unreg:
ib_dereg_mr(mr);
@@ -676,11 +663,8 @@ err_unreg:
err_put:
put_pd_read(pd);
-err_release:
- ib_umem_release(file->device->ib_dev, &obj->umem);
-
err_free:
- put_uobj_write(&obj->uobject);
+ put_uobj_write(uobj);
return ret;
}
@@ -691,7 +675,6 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
struct ib_uverbs_dereg_mr cmd;
struct ib_mr *mr;
struct ib_uobject *uobj;
- struct ib_umem_object *memobj;
int ret = -EINVAL;
if (copy_from_user(&cmd, buf, sizeof cmd))
@@ -701,8 +684,7 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
if (!uobj)
return -EINVAL;
- memobj = container_of(uobj, struct ib_umem_object, uobject);
- mr = uobj->object;
+ mr = uobj->object;
ret = ib_dereg_mr(mr);
if (!ret)
@@ -719,8 +701,6 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
list_del(&uobj->list);
mutex_unlock(&file->mutex);
- ib_umem_release(file->device->ib_dev, &memobj->umem);
-
put_uobj(uobj);
return in_len;
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index d44e5479965..14d7ccd8919 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -183,6 +183,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
if (!context)
return 0;
+ context->closing = 1;
+
list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
struct ib_ah *ah = uobj->object;
@@ -230,16 +232,10 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
struct ib_mr *mr = uobj->object;
- struct ib_device *mrdev = mr->device;
- struct ib_umem_object *memobj;
idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
ib_dereg_mr(mr);
-
- memobj = container_of(uobj, struct ib_umem_object, uobject);
- ib_umem_release_on_close(mrdev, &memobj->umem);
-
- kfree(memobj);
+ kfree(uobj);
}
list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) {
@@ -906,7 +902,6 @@ static void __exit ib_uverbs_cleanup(void)
unregister_filesystem(&uverbs_event_fs);
class_destroy(uverbs_class);
unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
- flush_scheduled_work();
idr_destroy(&ib_uverbs_pd_idr);
idr_destroy(&ib_uverbs_mr_idr);
idr_destroy(&ib_uverbs_mw_idr);