kernel-ark/drivers/vfio/vfio_iommu_spapr_tce.c
Alexey Kardashevskiy 4b6fad7097 powerpc/mm/iommu, vfio/spapr: Put pages on VFIO container shutdown
At the moment the userspace tool is expected to request pinning of
the entire guest RAM when VFIO IOMMU SPAPR v2 driver is present.
When the userspace process finishes, all the pinned pages need to
be put; this is done as a part of the userspace memory context (MM)
destruction which happens on the very last mmdrop().

This approach has a problem: the MM of a userspace process may live
longer than the userspace process itself, because a kernel thread uses
the MM of whichever userspace process was running on the CPU where
the kernel thread was scheduled. If this happens, the MM remains
referenced until this exact kernel thread wakes up again
and releases the very last reference to the MM; on an idle system this
can take hours.

This moves preregistered regions tracking from MM to VFIO; instead of
using mm_iommu_table_group_mem_t::used, tce_container::prereg_list is
added so each container releases the regions which it has pre-registered.

This changes the userspace interface to return EBUSY if a memory
region is already registered in a container. However it should not
have any practical effect as the only userspace tool available now
does register memory region once per container anyway.

As tce_iommu_register_pages/tce_iommu_unregister_pages are called
under container->lock, this does not need additional locking.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-12-02 14:38:34 +11:00

1397 lines
32 KiB
C

/*
* VFIO: IOMMU DMA mapping support for TCE on POWER
*
* Copyright (C) 2013 IBM Corp. All rights reserved.
* Author: Alexey Kardashevskiy <aik@ozlabs.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Derived from original vfio_iommu_type1.c:
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*/
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>
/* Module metadata; consumed by the MODULE_* macros at the end of this file. */
#define DRIVER_VERSION "0.1"
#define DRIVER_AUTHOR "aik@ozlabs.ru"
#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
/*
 * Forward declaration: tce_iommu_release() calls this before the
 * definition appears further down in the file.
 */
static void tce_iommu_detach_group(void *iommu_data,
struct iommu_group *iommu_group);
/*
 * Charge @npages pages to @mm's locked_vm counter.
 *
 * The update is done under mm->mmap_sem held for writing.
 *
 * Returns 0 on success (including the trivial @npages == 0 case),
 * -EPERM when @mm is NULL, or -ENOMEM when the new total would exceed
 * RLIMIT_MEMLOCK and the caller does not have CAP_IPC_LOCK.
 */
static long try_increment_locked_vm(struct mm_struct *mm, long npages)
{
	long rc = 0;
	long new_total;
	long limit;

	if (WARN_ON_ONCE(!mm))
		return -EPERM;

	if (npages == 0)
		return 0;

	down_write(&mm->mmap_sem);

	new_total = mm->locked_vm + npages;
	limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	/* capable() is only consulted once the limit is actually exceeded */
	if (new_total <= limit || capable(CAP_IPC_LOCK))
		mm->locked_vm += npages;
	else
		rc = -ENOMEM;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			rc ? " - exceeded" : "");

	up_write(&mm->mmap_sem);

	return rc;
}
/*
 * Return @npages pages to @mm's locked_vm counter.
 *
 * Does nothing when @mm is NULL or @npages is zero. If asked to release
 * more than is currently accounted, warns once and releases only what
 * is there. The update is done under mm->mmap_sem held for writing.
 */
static void decrement_locked_vm(struct mm_struct *mm, long npages)
{
	if (!mm)
		return;

	if (!npages)
		return;

	down_write(&mm->mmap_sem);

	/* Clamp so that the counter cannot go below zero */
	if (WARN_ON_ONCE(npages > mm->locked_vm))
		npages = mm->locked_vm;
	mm->locked_vm -= npages;

	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));

	up_write(&mm->mmap_sem);
}
/*
* VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
*
* This code handles mapping and unmapping of user data buffers
* into DMA'ble space using the IOMMU
*/
/* One IOMMU group attached to a container; linked on tce_container::group_list. */
struct tce_iommu_group {
/* Entry in the owning container's group_list */
struct list_head next;
/* The attached IOMMU group */
struct iommu_group *grp;
};
/*
* A container needs to remember which preregistered region it has
* referenced to do proper cleanup at the userspace process exit.
*/
struct tce_iommu_prereg {
/* Entry in the owning container's prereg_list */
struct list_head next;
/* Preregistered region obtained from mm_iommu_get() */
struct mm_iommu_table_group_mem_t *mem;
};
/*
* The container descriptor supports only a single group per container.
* Required by the API as the container is not supplied with the IOMMU group
* at the moment of initialization.
*/
struct tce_container {
/* Taken by group attach/detach and by the ioctl paths that modify state */
struct mutex lock;
/* Set by tce_iommu_enable() and tce_iommu_register_pages(); gates MAP/UNMAP */
bool enabled;
/* True when the container was opened as VFIO_SPAPR_TCE_v2_IOMMU */
bool v2;
/* Default window creation is deferred; see tce_iommu_create_default_window() */
bool def_window_pending;
/* Pages charged to locked_vm by tce_iommu_enable() */
unsigned long locked_pages;
/* mm the container is bound to; referenced via mm_count in tce_iommu_mm_set() */
struct mm_struct *mm;
/* TCE tables currently used by the container, indexed by window number */
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
/* List of struct tce_iommu_group */
struct list_head group_list;
/* List of struct tce_iommu_prereg */
struct list_head prereg_list;
};
/*
 * Bind the container to the calling process' mm.
 *
 * The first call records current->mm and takes an mm_count reference on
 * it. Later calls succeed only when made from the same mm.
 *
 * Returns 0 on success or -EPERM when the container already belongs to
 * a different mm.
 */
static long tce_iommu_mm_set(struct tce_container *container)
{
	if (!container->mm) {
		BUG_ON(!current->mm);
		container->mm = current->mm;
		atomic_inc(&container->mm->mm_count);
		return 0;
	}

	return (container->mm == current->mm) ? 0 : -EPERM;
}
/*
 * Drop the container's reference to a preregistered region.
 *
 * The tracking entry @tcemem is unlinked and freed only if
 * mm_iommu_put() succeeds; otherwise it stays on the list and the
 * error from mm_iommu_put() is returned.
 */
static long tce_iommu_prereg_free(struct tce_container *container,
		struct tce_iommu_prereg *tcemem)
{
	long rc = mm_iommu_put(container->mm, tcemem->mem);

	if (!rc) {
		list_del(&tcemem->next);
		kfree(tcemem);
	}

	return rc;
}
/*
 * Unregister a memory region previously registered in this container.
 *
 * @vaddr and @size must be page aligned (-EINVAL otherwise). Returns
 * -ENOENT when the region is unknown to the mm or was not registered
 * through this container, otherwise the result of
 * tce_iommu_prereg_free().
 */
static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *cur;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	/* Only regions this container itself registered may be released */
	list_for_each_entry(cur, &container->prereg_list, next) {
		if (cur->mem == mem)
			return tce_iommu_prereg_free(container, cur);
	}

	return -ENOENT;
}
/*
 * Preregister the userspace range [@vaddr, @vaddr + @size) in this
 * container.
 *
 * @vaddr and @size must be page aligned and the range must not wrap
 * (-EINVAL otherwise). Returns -EBUSY when the same region is already
 * registered in this container, -ENOMEM when the tracking entry cannot
 * be allocated, or the error from mm_iommu_get().
 *
 * On success the region is added to container->prereg_list and the
 * container is marked enabled.
 */
static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	mem = mm_iommu_find(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem)
				return -EBUSY;
		}
	}

	ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
	if (ret)
		return ret;

	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		/* Give back the reference taken by mm_iommu_get() above */
		mm_iommu_put(container->mm, mem);
		return -ENOMEM;
	}

	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;
}
/*
 * Allocate the zeroed it_userspace array for @tbl, one entry per TCE,
 * rounded up to a whole number of pages.
 *
 * The pages are charged to @mm's locked_vm first; the charge is undone
 * if the allocation fails. It is a bug to call this when the array
 * already exists.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error
 * from try_increment_locked_vm().
 */
static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl,
		struct mm_struct *mm)
{
	unsigned long bytes = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);
	unsigned long *view;
	long rc;

	BUG_ON(tbl->it_userspace);

	rc = try_increment_locked_vm(mm, bytes >> PAGE_SHIFT);
	if (rc)
		return rc;

	view = vzalloc(bytes);
	if (view) {
		tbl->it_userspace = view;
		return 0;
	}

	decrement_locked_vm(mm, bytes >> PAGE_SHIFT);
	return -ENOMEM;
}
/*
 * Free @tbl's it_userspace array, if there is one, and return its pages
 * to @mm's locked_vm counter.
 */
static void tce_iommu_userspace_view_free(struct iommu_table *tbl,
		struct mm_struct *mm)
{
	unsigned long bytes;

	if (!tbl->it_userspace)
		return;

	/* Same size computation as used when the array was allocated */
	bytes = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * tbl->it_size,
			PAGE_SIZE);

	vfree(tbl->it_userspace);
	tbl->it_userspace = NULL;
	decrement_locked_vm(mm, bytes >> PAGE_SHIFT);
}
/*
 * Check that an IOMMU page of size (1 << @page_shift) fits inside the
 * (possibly compound) host page backing @page.
 */
static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
	/*
	 * The TCE table granularity must not be bigger than the size of
	 * the page we just found, otherwise the hardware could get access
	 * to a bigger memory chunk than it should.
	 */
	unsigned host_shift = PAGE_SHIFT + compound_order(compound_head(page));

	return host_shift >= page_shift;
}
static inline bool tce_groups_attached(struct tce_container *container)
{
return !list_empty(&container->group_list);
}
/*
 * Find the container table whose window covers bus address @ioba.
 *
 * On a hit, stores the table in *@ptbl and returns its index in
 * container->tables[]. Returns -1 (leaving *@ptbl untouched) when no
 * table covers the address.
 */
static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long idx;

	for (idx = 0; idx < IOMMU_TABLE_GROUP_MAX_TABLES; ++idx) {
		struct iommu_table *tbl = container->tables[idx];
		unsigned long entry, first, past_end;

		if (!tbl)
			continue;

		entry = ioba >> tbl->it_page_shift;
		first = tbl->it_offset;
		past_end = first + tbl->it_size;

		if (entry < first || entry >= past_end)
			continue;

		*ptbl = tbl;
		return idx;
	}

	return -1;
}
static int tce_iommu_find_free_table(struct tce_container *container)
{
int i;
for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
if (!container->tables[i])
return i;
}
return -ENOSPC;
}
/*
 * Enable the container: bind it to the caller's mm and charge the whole
 * 32-bit DMA window of the first attached group to locked_vm.
 *
 * Returns -EBUSY if already enabled, -ENODEV when no group (or no table
 * group data) is attached, -EPERM when the group has no 32-bit window
 * or the container belongs to another mm, or the error from
 * try_increment_locked_vm().
 */
static int tce_iommu_enable(struct tce_container *container)
{
	struct iommu_table_group *table_group;
	struct tce_iommu_group *first_grp;
	unsigned long npages;
	int rc;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap. For powerpc, the map unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult to impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled. The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
	 * that would effectively kill the guest at random points, much better
	 * enforcing the limit based on the max that the guest can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
	 * each with 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (list_empty(&container->group_list))
		return -ENODEV;

	first_grp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(first_grp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	rc = tce_iommu_mm_set(container);
	if (rc)
		return rc;

	npages = table_group->tce32_size >> PAGE_SHIFT;
	rc = try_increment_locked_vm(container->mm, npages);
	if (rc)
		return rc;

	/* Remember the charge so tce_iommu_disable() can undo it */
	container->locked_pages = npages;
	container->enabled = true;

	return 0;
}
/*
 * Disable the container and give back the locked_vm pages recorded in
 * container->locked_pages. No-op when the container is not enabled.
 */
static void tce_iommu_disable(struct tce_container *container)
{
	if (container->enabled) {
		container->enabled = false;

		BUG_ON(!container->mm);
		decrement_locked_vm(container->mm, container->locked_pages);
	}
}
static void *tce_iommu_open(unsigned long arg)
{
struct tce_container *container;
if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
pr_err("tce_vfio: Wrong IOMMU type\n");
return ERR_PTR(-EINVAL);
}
container = kzalloc(sizeof(*container), GFP_KERNEL);
if (!container)
return ERR_PTR(-ENOMEM);
mutex_init(&container->lock);
INIT_LIST_HEAD_RCU(&container->group_list);
INIT_LIST_HEAD_RCU(&container->prereg_list);
container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
return container;
}
/*
 * Forward declarations: tce_iommu_release() below uses both helpers
 * before their definitions appear.
 */
static int tce_iommu_clear(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
struct iommu_table *tbl);
/*
 * Tear down a container: detach every group, dispose of any tables
 * still held, drop all preregistered regions, undo the locked_vm
 * accounting, release the mm reference and free the container.
 */
static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	long idx;

	while (tce_groups_attached(container)) {
		struct tce_iommu_group *grp_entry =
			list_first_entry(&container->group_list,
					struct tce_iommu_group, next);

		tce_iommu_detach_group(iommu_data, grp_entry->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed
	 * by tce_iommu_detach_group() so do it now.
	 */
	for (idx = 0; idx < IOMMU_TABLE_GROUP_MAX_TABLES; ++idx) {
		struct iommu_table *tbl = container->tables[idx];

		if (tbl) {
			tce_iommu_clear(container, tbl,
					tbl->it_offset, tbl->it_size);
			tce_iommu_free_table(container, tbl);
		}
	}

	/* Release every region this container preregistered */
	while (!list_empty(&container->prereg_list)) {
		struct tce_iommu_prereg *region =
			list_first_entry(&container->prereg_list,
					struct tce_iommu_prereg, next);

		WARN_ON_ONCE(tce_iommu_prereg_free(container, region));
	}

	tce_iommu_disable(container);

	if (container->mm)
		mmdrop(container->mm);

	mutex_destroy(&container->lock);
	kfree(container);
}
/* Drop the page reference held for the host physical address @hpa. */
static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	put_page(pfn_to_page(hpa >> PAGE_SHIFT));
}
/*
 * Translate userspace address @tce to a host physical address using the
 * preregistered regions of the container's mm.
 *
 * On success stores the address in *@phpa and the owning region in
 * *@pmem and returns 0. Returns -EINVAL when no region covers
 * [@tce, @tce + @size) or the translation fails; *@pmem is left
 * untouched in that case.
 */
static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
		unsigned long tce, unsigned long size,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	struct mm_iommu_table_group_mem_t *region;

	region = mm_iommu_lookup(container->mm, tce, size);
	if (!region)
		return -EINVAL;

	if (mm_iommu_ua_to_hpa(region, tce, phpa))
		return -EINVAL;

	*pmem = region;

	return 0;
}
/*
 * Release the preregistered-region mapping reference recorded for table
 * entry @entry and clear its userspace-view slot.
 *
 * Does nothing when the table has no userspace-view slot for @entry.
 */
static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	unsigned long *slot = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
	struct mm_iommu_table_group_mem_t *region = NULL;
	unsigned long hpa = 0;
	int ret;

	if (!slot)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, *slot,
			IOMMU_PAGE_SIZE(tbl), &hpa, &region);
	if (ret)
		pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
				__func__, *slot, entry, ret);

	/* region stays NULL when the lookup above failed */
	if (region)
		mm_iommu_mapped_dec(region);

	*slot = 0;
}
static int tce_iommu_clear(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long pages)
{
unsigned long oldhpa;
long ret;
enum dma_data_direction direction;
for ( ; pages; --pages, ++entry) {
direction = DMA_NONE;
oldhpa = 0;
ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
if (ret)
continue;
if (direction == DMA_NONE)
continue;
if (container->v2) {
tce_iommu_unuse_page_v2(container, tbl, entry);
continue;
}
tce_iommu_unuse_page(container, oldhpa);
}
return 0;
}
/*
 * Pin the single user page containing @tce and store its host physical
 * address in *@hpa.
 *
 * The page is pinned for writing unless the TCE direction is
 * DMA_TO_DEVICE. Returns 0 on success or -EFAULT when the page cannot
 * be pinned.
 */
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	int want_write = iommu_tce_direction(tce) != DMA_TO_DEVICE;
	int pinned;

	pinned = get_user_pages_fast(tce & PAGE_MASK, 1, want_write, &page);
	if (pinned != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}
/*
 * Map @pages IOMMU pages starting at table entry @entry to the user
 * memory starting at @tce (non-v2 containers: each page is pinned here).
 *
 * On any failure every entry mapped so far by this call is cleared
 * again and the error is returned: -EFAULT when a page cannot be
 * pinned, -EPERM when the backing page is smaller than the IOMMU page,
 * or the error from iommu_tce_xchg().
 */
static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			/*
			 * The page was pinned above but never reaches the
			 * table, so tce_iommu_clear() below will not see it:
			 * drop the reference here.
			 */
			tce_iommu_unuse_page(container, hpa);
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		/* The exchange returned a previous mapping: release its page */
		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	/* Roll back the entries that were successfully installed */
	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}
/*
 * Map @pages IOMMU pages starting at table entry @entry to the user
 * memory starting at @tce, for v2 containers.
 *
 * Instead of pinning pages, each address is translated through the
 * preregistered regions of the container's mm, and the userspace
 * address is recorded in the table's it_userspace array (allocated
 * here on first use).
 *
 * When the loop ends with a non-zero ret, the first i entries are
 * cleared again before returning.
 */
static long tce_iommu_build_v2(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long tce, unsigned long pages,
enum dma_data_direction direction)
{
long i, ret = 0;
struct page *page;
unsigned long hpa;
enum dma_data_direction dirtmp;
/* Allocate the userspace view lazily, on the first mapping request */
if (!tbl->it_userspace) {
ret = tce_iommu_userspace_view_alloc(tbl, container->mm);
if (ret)
return ret;
}
for (i = 0; i < pages; ++i) {
struct mm_iommu_table_group_mem_t *mem = NULL;
unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
entry + i);
/* Fails with -EINVAL when tce is not inside a preregistered region */
ret = tce_iommu_prereg_ua_to_hpa(container,
tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem);
if (ret)
break;
page = pfn_to_page(hpa >> PAGE_SHIFT);
if (!tce_page_is_contained(page, tbl->it_page_shift)) {
ret = -EPERM;
break;
}
/* Preserve offset within IOMMU page */
hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
dirtmp = direction;
/* The registered region is being unregistered */
/*
 * NOTE(review): ret is still 0 at this point, so this break makes
 * the function return 0 with only the first i entries mapped and no
 * rollback - confirm that this is intended.
 */
if (mm_iommu_mapped_inc(mem))
break;
ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
if (ret) {
/* dirtmp cannot be DMA_NONE here */
tce_iommu_unuse_page_v2(container, tbl, entry + i);
pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
__func__, entry << tbl->it_page_shift,
tce, ret);
break;
}
/* The exchange returned a previous mapping: release its reference */
if (dirtmp != DMA_NONE)
tce_iommu_unuse_page_v2(container, tbl, entry + i);
/* Record the userspace address backing this entry */
*pua = tce;
tce += IOMMU_PAGE_SIZE(tbl);
}
if (ret)
tce_iommu_clear(container, tbl, entry, i);
return ret;
}
/*
 * Create a TCE table through the platform's create_table() hook after
 * charging its size to the container mm's locked_vm counter.
 *
 * On success the new table is stored in *@ptbl and 0 is returned; the
 * charge is released later by tce_iommu_free_table(). On failure the
 * charge is undone here, as no table exists to be freed.
 *
 * Returns -EINVAL when the platform reports a zero table size, the
 * error from try_increment_locked_vm(), or the error from
 * create_table().
 */
static long tce_iommu_create_table(struct tce_container *container,
		struct iommu_table_group *table_group,
		int num,
		__u32 page_shift,
		__u64 window_size,
		__u32 levels,
		struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);
	if (ret) {
		/* No table was created, so nobody else will undo the charge */
		decrement_locked_vm(container->mm, table_size >> PAGE_SHIFT);
		return ret;
	}

	WARN_ON(!(*ptbl)->it_ops->free);
	WARN_ON((*ptbl)->it_allocated_size != table_size);

	return 0;
}
/*
 * Free a table: release its userspace view, hand the table to its
 * it_ops->free() hook and return the table's pages to the container
 * mm's locked_vm counter.
 */
static void tce_iommu_free_table(struct tce_container *container,
struct iommu_table *tbl)
{
/*
 * Read the size before calling it_ops->free(); presumably tbl must not
 * be dereferenced afterwards - TODO confirm against the platform hook.
 */
unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
tce_iommu_userspace_view_free(tbl, container->mm);
tbl->it_ops->free(tbl);
decrement_locked_vm(container->mm, pages);
}
/*
 * Create a new DMA window in the first free table slot and program it
 * into every group attached to the container.
 *
 * On success the table is stored in container->tables[] and the bus
 * address of the window start is written to *@start_addr.
 *
 * Returns -ENOSPC when no slot is free, -EFAULT when the first group
 * has no table group data, -EINVAL for an unsupported page size,
 * -EPERM when the platform lacks one of the required hooks, or the
 * error from table creation / set_window().
 *
 * NOTE(review): the first group is fetched with list_first_entry()
 * without checking for an empty list; the callers visible in this file
 * check tce_groups_attached() beforehand - verify for any new caller.
 */
static long tce_iommu_create_window(struct tce_container *container,
__u32 page_shift, __u64 window_size, __u32 levels,
__u64 *start_addr)
{
struct tce_iommu_group *tcegrp;
struct iommu_table_group *table_group;
struct iommu_table *tbl = NULL;
long ret, num;
num = tce_iommu_find_free_table(container);
if (num < 0)
return num;
/* Get the first group for ops::create_table */
tcegrp = list_first_entry(&container->group_list,
struct tce_iommu_group, next);
table_group = iommu_group_get_iommudata(tcegrp->grp);
if (!table_group)
return -EFAULT;
/* The requested IOMMU page size must be one the platform advertises */
if (!(table_group->pgsizes & (1ULL << page_shift)))
return -EINVAL;
if (!table_group->ops->set_window || !table_group->ops->unset_window ||
!table_group->ops->get_table_size ||
!table_group->ops->create_table)
return -EPERM;
/* Create TCE table */
ret = tce_iommu_create_table(container, table_group, num,
page_shift, window_size, levels, &tbl);
if (ret)
return ret;
BUG_ON(!tbl->it_ops->free);
/*
* Program the table to every group.
* Groups have been tested for compatibility at the attach time.
*/
list_for_each_entry(tcegrp, &container->group_list, next) {
table_group = iommu_group_get_iommudata(tcegrp->grp);
ret = table_group->ops->set_window(table_group, num, tbl);
if (ret)
goto unset_exit;
}
container->tables[num] = tbl;
/* Return start address assigned by platform in create_table() */
*start_addr = tbl->it_offset << tbl->it_page_shift;
return 0;
unset_exit:
/*
 * Rollback: unset_window() is called on every attached group, including
 * the one where set_window() failed and any that were not reached.
 */
list_for_each_entry(tcegrp, &container->group_list, next) {
table_group = iommu_group_get_iommudata(tcegrp->grp);
table_group->ops->unset_window(table_group, num);
}
tce_iommu_free_table(container, tbl);
return ret;
}
/*
 * Remove the DMA window that covers bus address @start_addr: unset it
 * on every attached group, clear its entries and free the table.
 *
 * Returns 0 on success, -EINVAL when no container table covers
 * @start_addr, or -EPERM when a group's platform cannot unset windows.
 */
static long tce_iommu_remove_window(struct tce_container *container,
__u64 start_addr)
{
struct iommu_table_group *table_group = NULL;
struct iommu_table *tbl;
struct tce_iommu_group *tcegrp;
int num;
num = tce_iommu_find_table(container, start_addr, &tbl);
if (num < 0)
return -EINVAL;
BUG_ON(!tbl->it_size);
/* Detach groups from IOMMUs */
list_for_each_entry(tcegrp, &container->group_list, next) {
table_group = iommu_group_get_iommudata(tcegrp->grp);
/*
* SPAPR TCE IOMMU exposes the default DMA window to
* the guest via dma32_window_start/size of
* VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
* the userspace to remove this window, some do not so
* here we check for the platform capability.
*/
/*
 * NOTE(review): this returns from inside the loop, so groups visited
 * earlier have already had the window unset - confirm that a mixed
 * set of groups cannot occur here.
 */
if (!table_group->ops || !table_group->ops->unset_window)
return -EPERM;
table_group->ops->unset_window(table_group, num);
}
/* Free table */
tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
tce_iommu_free_table(container, tbl);
container->tables[num] = NULL;
return 0;
}
/*
 * Create the deferred default DMA window, if one is still pending.
 *
 * The window uses 4K IOMMU pages, a single level and the 32-bit window
 * size of the first attached group. A non-zero start address assigned
 * by the platform triggers a one-time warning.
 *
 * Returns 0 when nothing is pending or the window was created, -ENODEV
 * when no group (or no table group data) is attached, or the error from
 * tce_iommu_create_window().
 */
static long tce_iommu_create_default_window(struct tce_container *container)
{
	struct iommu_table_group *table_group;
	struct tce_iommu_group *first_grp;
	__u64 start_addr = 0;
	long rc;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	first_grp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(first_grp->grp);
	if (!table_group)
		return -ENODEV;

	rc = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	if (rc)
		return rc;

	WARN_ON_ONCE(start_addr);
	container->def_window_pending = false;

	return 0;
}
/*
 * ioctl entry point of the SPAPR TCE IOMMU backend.
 *
 * VFIO_CHECK_EXTENSION is answered without touching the container.
 * Every other command is refused with -EPERM when the container is
 * already bound to an mm other than the caller's.
 *
 * Commands that exist only for one flavour of the container (v1:
 * ENABLE/DISABLE; v2: REGISTER/UNREGISTER_MEMORY, TCE_CREATE,
 * TCE_REMOVE) fall out of the switch for the other flavour and, like
 * unknown commands, return -ENOTTY.
 */
static long tce_iommu_ioctl(void *iommu_data,
		unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		/* An error from the EEH helper means "not supported" */
		return (ret < 0) ? 0 : ret;
	}

	/*
	 * Sanity check to prevent one userspace from manipulating
	 * another userspace mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		/* Copy the DDW part back only if the caller's buffer has room */
		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		/* Nothing can have been registered before an mm was bound */
		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		/* Apply the operation to each group; stop at the first error */
		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		/*
		 * Do not return between lock and unlock: a failure to create
		 * the default window must still release container->lock.
		 */
		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		/*
		 * Removing the default window before it was ever created
		 * only needs the pending request to be cancelled.
		 */
		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}
/*
 * Give back every table the container holds: clear its entries, free
 * its userspace view, release ownership of tables that have an it_map,
 * and forget the table in container->tables[].
 */
static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int slot;

	for (slot = 0; slot < IOMMU_TABLE_GROUP_MAX_TABLES; ++slot) {
		struct iommu_table *tbl = container->tables[slot];

		if (tbl) {
			tce_iommu_clear(container, tbl,
					tbl->it_offset, tbl->it_size);
			tce_iommu_userspace_view_free(tbl, container->mm);
			if (tbl->it_map)
				iommu_release_ownership(tbl);

			container->tables[slot] = NULL;
		}
	}
}
/*
 * Take ownership of every table of @table_group that has an it_map and
 * then mirror the group's tables into container->tables[].
 *
 * If taking ownership of one table fails, ownership of the tables
 * taken so far is released again and the error is returned; the
 * container's tables are left untouched in that case.
 */
static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			/*
			 * Undo only the slots that were actually taken above:
			 * empty slots and tables without an it_map were
			 * skipped and must be skipped here as well.
			 */
			for (j = 0; j < i; ++j) {
				struct iommu_table *taken =
					table_group->tables[j];

				if (taken && taken->it_map)
					iommu_release_ownership(taken);
			}

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}
/*
 * Release a group that was taken over through the table group ops:
 * unset every window slot on the group, then call the platform's
 * release_ownership() hook.
 *
 * Warns once and does nothing if the platform has no unset_window().
 */
static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long win;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	win = 0;
	while (win < IOMMU_TABLE_GROUP_MAX_TABLES) {
		table_group->ops->unset_window(table_group, win);
		++win;
	}

	table_group->ops->release_ownership(table_group);
}
/*
 * Take over a group through the platform's take_ownership() hook.
 *
 * Requires create_table(), set_window() and release_ownership() to be
 * provided; otherwise warns once and returns -EFAULT. Returns 0 on
 * success.
 */
static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	if (table_group->ops->create_table && table_group->ops->set_window &&
			table_group->ops->release_ownership) {
		table_group->ops->take_ownership(table_group);
		return 0;
	}

	WARN_ON_ONCE(1);
	return -EFAULT;
}
static int tce_iommu_attach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
int ret;
struct tce_container *container = iommu_data;
struct iommu_table_group *table_group;
struct tce_iommu_group *tcegrp = NULL;
mutex_lock(&container->lock);
/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
iommu_group_id(iommu_group), iommu_group); */
table_group = iommu_group_get_iommudata(iommu_group);
if (tce_groups_attached(container) && (!table_group->ops ||
!table_group->ops->take_ownership ||
!table_group->ops->release_ownership)) {
ret = -EBUSY;
goto unlock_exit;
}
/* Check if new group has the same iommu_ops (i.e. compatible) */
list_for_each_entry(tcegrp, &container->group_list, next) {
struct iommu_table_group *table_group_tmp;
if (tcegrp->grp == iommu_group) {
pr_warn("tce_vfio: Group %d is already attached\n",
iommu_group_id(iommu_group));
ret = -EBUSY;
goto unlock_exit;
}
table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
if (table_group_tmp->ops->create_table !=
table_group->ops->create_table) {
pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
iommu_group_id(iommu_group),
iommu_group_id(tcegrp->grp));
ret = -EPERM;
goto unlock_exit;
}
}
tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
if (!tcegrp) {
ret = -ENOMEM;
goto unlock_exit;
}
if (!table_group->ops || !table_group->ops->take_ownership ||
!table_group->ops->release_ownership) {
ret = tce_iommu_take_ownership(container, table_group);
} else {
ret = tce_iommu_take_ownership_ddw(container, table_group);
if (!tce_groups_attached(container) && !container->tables[0])
container->def_window_pending = true;
}
if (!ret) {
tcegrp->grp = iommu_group;
list_add(&tcegrp->next, &container->group_list);
}
unlock_exit:
if (ret && tcegrp)
kfree(tcegrp);
mutex_unlock(&container->lock);
return ret;
}
static void tce_iommu_detach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
struct tce_container *container = iommu_data;
struct iommu_table_group *table_group;
bool found = false;
struct tce_iommu_group *tcegrp;
mutex_lock(&container->lock);
list_for_each_entry(tcegrp, &container->group_list, next) {
if (tcegrp->grp == iommu_group) {
found = true;
break;
}
}
if (!found) {
pr_warn("tce_vfio: detaching unattached group #%u\n",
iommu_group_id(iommu_group));
goto unlock_exit;
}
list_del(&tcegrp->next);
kfree(tcegrp);
table_group = iommu_group_get_iommudata(iommu_group);
BUG_ON(!table_group);
if (!table_group->ops || !table_group->ops->release_ownership)
tce_iommu_release_ownership(container, table_group);
else
tce_iommu_release_ownership_ddw(container, table_group);
unlock_exit:
mutex_unlock(&container->lock);
}
/*
 * VFIO IOMMU backend callbacks implemented by this file; registered with
 * VFIO in tce_iommu_init() and unregistered in tce_iommu_cleanup().
 */
const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
.name = "iommu-vfio-powerpc",
.owner = THIS_MODULE,
.open = tce_iommu_open,
.release = tce_iommu_release,
.ioctl = tce_iommu_ioctl,
.attach_group = tce_iommu_attach_group,
.detach_group = tce_iommu_detach_group,
};
/* Module init: register this backend with VFIO and report the result. */
static int __init tce_iommu_init(void)
{
	int rc = vfio_register_iommu_driver(&tce_iommu_driver_ops);

	return rc;
}
/* Module exit: unregister this backend from VFIO. */
static void __exit tce_iommu_cleanup(void)
{
vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}
/* Module entry/exit hooks and metadata (strings come from the DRIVER_* macros). */
module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);