kernel-ark/fs/nfs/objlayout/objlayout.c
Boaz Harrosh 5c0b4129c0 pnfs-obj: pNFS errors are communicated on iodata->pnfs_error
Some time along the way pNFS IO errors were switched to
communicate with a special iodata->pnfs_error member instead
of the regular RPC members. But objlayout was not switched
over.

Fix that!
Without this fix any IO error causes a hang, because the IO is not
redirected to the MDS and the pages are never cleared or read.

[Applies to 3.2.0. Same bug different patch for 3.1/0 Kernels]
CC: Stable Tree <stable@kernel.org>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2012-01-06 08:55:23 -05:00

654 lines
17 KiB
C

/*
* pNFS Objects layout driver high level definitions
*
* Copyright (C) 2007 Panasas Inc. [year of first publication]
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
* Boaz Harrosh <bharrosh@panasas.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
* See the file COPYING included with this distribution for more details.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the Panasas company nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <scsi/osd_initiator.h>
#include "objlayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
/*
 * Allocate a zeroed objlayout and return its embedded pnfs_layout_hdr.
 *
 * @inode:     not used by this function
 * @gfp_flags: allocation flags handed to kzalloc()
 *
 * The spinlock and the I/O error list of the new objlayout are initialised.
 * Returns NULL if the allocation fails.
 */
struct pnfs_layout_hdr *
objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
{
	struct objlayout *objlay;

	objlay = kzalloc(sizeof(*objlay), gfp_flags);
	dprintk("%s: Return %p\n", __func__, objlay);

	/*
	 * Return NULL explicitly.  Taking &objlay->pnfs_layout of a NULL
	 * objlay only yields NULL while pnfs_layout sits at offset 0 of
	 * struct objlayout; do not depend on the structure layout.
	 */
	if (!objlay)
		return NULL;

	spin_lock_init(&objlay->lock);
	INIT_LIST_HEAD(&objlay->err_list);
	return &objlay->pnfs_layout;
}
/*
 * Free the objlayout that embeds @lo.
 *
 * Warns if I/O error records are still queued on the layout's err_list
 * when it is released.
 */
void
objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct objlayout *layout = OBJLAYOUT(lo);

	dprintk("%s: objlay %p\n", __func__, layout);

	WARN_ON(!list_empty(&layout->err_list));
	kfree(layout);
}
/*
 * Decode the layout carried in @lgr and build a layout segment from it.
 *
 * @pnfslay:   layout header the new segment is created for
 * @lgr:       LAYOUTGET result; its pages hold the encoded layout and its
 *             range is passed on to objio_alloc_lseg()
 * @gfp_flags: allocation flags for the scratch page and the segment
 *
 * A temporary scratch page is given to the XDR decoder and released before
 * returning.  Returns the new segment, or an ERR_PTR() carrying -ENOMEM
 * (no scratch page) or the error reported by objio_alloc_lseg().
 */
struct pnfs_layout_segment *
objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
		     struct nfs4_layoutget_res *lgr,
		     gfp_t gfp_flags)
{
	struct xdr_buf buf = {
		.pages = lgr->layoutp->pages,
		.page_len = lgr->layoutp->len,
		.buflen = lgr->layoutp->len,
		.len = lgr->layoutp->len,
	};
	struct pnfs_layout_segment *lseg;
	struct xdr_stream stream;
	struct page *scratch;
	int status;

	dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);

	scratch = alloc_page(gfp_flags);
	if (!scratch) {
		status = -ENOMEM;
		goto fail;
	}

	xdr_init_decode(&stream, &buf, NULL);
	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);

	status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
	/* The scratch page is only needed while decoding. */
	__free_page(scratch);

	if (likely(!status)) {
		dprintk("%s: Return %p\n", __func__, lseg);
		return lseg;
	}

	dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
		status);
fail:
	dprintk("%s: Err Return=>%d\n", __func__, status);
	return ERR_PTR(status);
}
/*
 * Free a layout segment.
 *
 * A NULL @lseg is logged and otherwise ignored.
 */
void
objlayout_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s: freeing layout segment %p\n", __func__, lseg);

	if (likely(lseg))
		objio_free_lseg(lseg);
}
/*
* I/O Operations
*/
/*
 * First offset past the range [start, start + len).
 * Saturates to NFS4_MAX_UINT64 when start + len wraps around.
 */
static inline u64
end_offset(u64 start, u64 len)
{
	u64 sum = start + len;

	if (sum < start)
		return NFS4_MAX_UINT64;
	return sum;
}
/*
 * Offset of the last octet in the range [start, start + len).
 * @len must be non-zero (BUG otherwise).  Yields NFS4_MAX_UINT64 when
 * start + len wraps around.
 */
static inline u64
last_byte_offset(u64 start, u64 len)
{
	u64 past_end;

	BUG_ON(!len);

	past_end = start + len;
	if (past_end <= start)
		return NFS4_MAX_UINT64;
	return past_end - 1;
}
/*
 * Sanity-check an I/O request against its layout segment and normalise the
 * page-array / page-base pair.
 *
 * @lseg:     layout segment the I/O is issued under
 * @p_pages:  in/out, page array pointer; advanced when *p_pgbase is folded
 * @p_pgbase: in/out, byte offset into the first page
 * @offset:   file offset of the I/O
 * @count:    byte count of the I/O
 *
 * BUGs if @offset lies outside the segment's range and warns if the I/O
 * runs past the segment's end.  A page base larger than PAGE_SIZE is folded
 * into whole pages plus an in-page remainder.
 *
 * NOTE(review): *p_pgbase == PAGE_SIZE is left untouched by the '>' test;
 * confirm whether '>=' was intended.
 */
void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
			   struct page ***p_pages, unsigned *p_pgbase,
			   u64 offset, unsigned long count)
{
	u64 seg_end;
	unsigned pgbase;

	BUG_ON(offset < lseg->pls_range.offset);
	seg_end = end_offset(lseg->pls_range.offset, lseg->pls_range.length);
	BUG_ON(offset >= seg_end);
	WARN_ON(offset + count > seg_end);

	pgbase = *p_pgbase;
	if (pgbase <= PAGE_SIZE)
		return;

	dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, pgbase);
	*p_pages += pgbase >> PAGE_SHIFT;
	*p_pgbase = pgbase & ~PAGE_MASK;
}
/*
 * I/O done common code
 *
 * On success (oir->status >= 0) the result record is freed right away.
 * On failure the record is queued on the layout's err_list, under
 * objlay->lock, and the layout's space accounting is marked invalid.
 * The queued records are consumed and freed later when the layoutreturn
 * is encoded.
 *
 * Either way the caller must not touch @oir after this returns.
 */
static void
objlayout_iodone(struct objlayout_io_res *oir)
{
	if (likely(oir->status >= 0)) {
		objio_free_result(oir);
	} else {
		struct objlayout *objlay = oir->objlay;

		spin_lock(&objlay->lock);
		objlay->delta_space_valid = OBJ_DSU_INVALID;
		/*
		 * list_add(new, head): the record is the new entry and the
		 * layout's err_list is the head.  With the arguments swapped
		 * the head itself gets re-linked on every failure, which
		 * unlinks (and leaks) all previously queued records.
		 */
		list_add(&oir->err_list, &objlay->err_list);
		spin_unlock(&objlay->lock);
	}
}
/*
 * objlayout_io_set_result - record the outcome of one component I/O.
 *
 * @oir:       result record of the whole I/O
 * @index:     component slot in @oir->ioerrs; must be below @oir->num_comps
 * @pooid:     object id of the component (copied only when there is an error)
 * @osd_error: error code for the component, 0 for success
 * @offset:    component offset of the failed I/O
 * @length:    component length of the failed I/O
 * @is_write:  true if the failed I/O was a write
 *
 * A non-zero @osd_error is stored in the slot so it can be reported at
 * layout-return.  A zero @osd_error just clears the slot's errno.
 */
void
objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
			struct pnfs_osd_objid *pooid, int osd_error,
			u64 offset, u64 length, bool is_write)
{
	struct pnfs_osd_ioerr *slot;

	BUG_ON(index >= oir->num_comps);
	slot = &oir->ioerrs[index];

	if (!osd_error) {
		/* User need not call if no error is reported */
		slot->oer_errno = 0;
		return;
	}

	slot->oer_component = *pooid;
	slot->oer_comp_offset = offset;
	slot->oer_comp_length = length;
	slot->oer_iswrite = is_write;
	slot->oer_errno = osd_error;

	dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
		"par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
		__func__, index, slot->oer_errno,
		slot->oer_iswrite,
		_DEVID_LO(&slot->oer_component.oid_device_id),
		_DEVID_HI(&slot->oer_component.oid_device_id),
		slot->oer_component.oid_partition_id,
		slot->oer_component.oid_object_id,
		slot->oer_comp_offset,
		slot->oer_comp_length);
}
/* Work item that finishes an asynchronous read by calling
 * pnfs_ld_read_done().  The work is embedded in the rpc_task of the
 * nfs_read_data, which is recovered from @work.  Completion is deferred
 * this way because the osd completion is called with ints-off from the
 * block layer.
 */
static void _rpc_read_complete(struct work_struct *work)
{
	struct rpc_task *rpc = container_of(work, struct rpc_task, u.tk_work);
	struct nfs_read_data *rdata =
		container_of(rpc, struct nfs_read_data, task);

	dprintk("%s enter\n", __func__);
	pnfs_ld_read_done(rdata);
}
/*
 * Completion entry point for a read issued through the objects layout.
 *
 * @oir:    result record of the I/O; handed to objlayout_iodone() and not
 *          used afterwards
 * @status: byte count on success, negative error on failure
 * @sync:   true to complete in the caller's context, false to defer the
 *          completion to a work item
 *
 * Stores @status in the rpc task and the result record, then either the
 * byte count (success) or pnfs_error (failure) in the read data.
 */
void
objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
	struct nfs_read_data *rdata = oir->rpcdata;

	rdata->task.tk_status = status;
	oir->status = rdata->task.tk_status;

	if (status < 0)
		rdata->pnfs_error = status;
	else
		rdata->res.count = status;

	objlayout_iodone(oir);
	/* must not use oir after this point */

	dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
		status, rdata->res.eof, sync);

	if (!sync) {
		INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
		schedule_work(&rdata->task.u.tk_work);
		return;
	}
	pnfs_ld_read_done(rdata);
}
/*
 * Start a read through the objects layout (sync or async).
 *
 * A read that starts at or beyond the inode size is answered locally with
 * a zero count and eof set.  Otherwise the eof flag is computed from the
 * range clipped to the inode size, the I/O parameters are verified and the
 * read is handed to objio_read_pagelist().
 *
 * Returns PNFS_ATTEMPTED, or PNFS_NOT_ATTEMPTED with rdata->pnfs_error set
 * when objio_read_pagelist() reports an error.
 */
enum pnfs_try_status
objlayout_read_pagelist(struct nfs_read_data *rdata)
{
	loff_t offset = rdata->args.offset;
	size_t count = rdata->args.count;
	loff_t eof = i_size_read(rdata->inode);
	int err;

	if (unlikely(offset + count > eof)) {
		if (offset >= eof) {
			/* Nothing to read from the devices. */
			rdata->res.count = 0;
			rdata->res.eof = 1;
			/*FIXME: do we need to call pnfs_ld_read_done() */
			return PNFS_ATTEMPTED;
		}
		count = eof - offset;
	}

	rdata->res.eof = (offset + count) >= eof;
	_fix_verify_io_params(rdata->lseg, &rdata->args.pages,
			      &rdata->args.pgbase,
			      rdata->args.offset, rdata->args.count);

	dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
		__func__, rdata->inode->i_ino, offset, count, rdata->res.eof);

	err = objio_read_pagelist(rdata);
	if (likely(!err))
		return PNFS_ATTEMPTED;

	rdata->pnfs_error = err;
	dprintk("%s: Returned Error %d\n", __func__, err);
	return PNFS_NOT_ATTEMPTED;
}
/* Work item that finishes an asynchronous write by calling
 * pnfs_ld_write_done().  The work is embedded in the rpc_task of the
 * nfs_write_data, which is recovered from @work.  Completion is deferred
 * this way because the osd completion is called with ints-off from the
 * block layer.
 */
static void _rpc_write_complete(struct work_struct *work)
{
	struct rpc_task *rpc = container_of(work, struct rpc_task, u.tk_work);
	struct nfs_write_data *wdata =
		container_of(rpc, struct nfs_write_data, task);

	dprintk("%s enter\n", __func__);
	pnfs_ld_write_done(wdata);
}
/*
 * Completion entry point for a write issued through the objects layout.
 *
 * @oir:    result record of the I/O; handed to objlayout_iodone() and not
 *          used afterwards
 * @status: byte count on success, negative error on failure
 * @sync:   true to complete in the caller's context, false to defer the
 *          completion to a work item
 *
 * On success the byte count and the commit level from @oir are stored in
 * the write data; on failure pnfs_error is set instead.
 */
void
objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
	struct nfs_write_data *wdata = oir->rpcdata;

	wdata->task.tk_status = status;
	oir->status = wdata->task.tk_status;

	if (status < 0) {
		wdata->pnfs_error = status;
	} else {
		wdata->res.count = status;
		wdata->verf.committed = oir->committed;
	}

	objlayout_iodone(oir);
	/* must not use oir after this point */

	dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
		status, wdata->verf.committed, sync);

	if (!sync) {
		INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
		schedule_work(&wdata->task.u.tk_work);
		return;
	}
	pnfs_ld_write_done(wdata);
}
/*
* Perform sync or async writes.
*/
enum pnfs_try_status
objlayout_write_pagelist(struct nfs_write_data *wdata,
int how)
{
int err;
_fix_verify_io_params(wdata->lseg, &wdata->args.pages,
&wdata->args.pgbase,
wdata->args.offset, wdata->args.count);
err = objio_write_pagelist(wdata, how);
if (unlikely(err)) {
wdata->pnfs_error = err;
dprintk("%s: Returned Error %d\n", __func__, err);
return PNFS_NOT_ATTEMPTED;
}
return PNFS_ATTEMPTED;
}
/*
 * Encode the objects-layout specific body of a LAYOUTCOMMIT.
 *
 * @pnfslay: layout being committed
 * @xdr:     stream to encode into
 * @args:    not used by this function
 *
 * Under objlay->lock the accumulated delta-space accounting is snapshotted
 * and reset, and the I/O error flag is taken from whether err_list is
 * non-empty.  The encoded update is preceded by a 4-byte length word that
 * is reserved first and filled in once the body size is known.
 */
void
objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
			      struct xdr_stream *xdr,
			      const struct nfs4_layoutcommit_args *args)
{
	struct objlayout *objlay = OBJLAYOUT(pnfslay);
	struct pnfs_osd_layoutupdate lou;
	__be32 *start;
	int err;

	dprintk("%s: Begin\n", __func__);

	spin_lock(&objlay->lock);
	lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
	lou.dsu_delta = objlay->delta_space_used;
	objlay->delta_space_used = 0;
	objlay->delta_space_valid = OBJ_DSU_INIT;
	lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
	spin_unlock(&objlay->lock);

	start = xdr_reserve_space(xdr, 4);
	/*
	 * The length word is written through 'start' below; fail loudly
	 * here, as objlayout_encode_layoutreturn() does, rather than
	 * dereferencing a NULL pointer.
	 */
	BUG_ON(!start);

	/* Keep the side-effecting call out of the BUG_ON() expression. */
	err = pnfs_osd_xdr_encode_layoutupdate(xdr, &lou);
	BUG_ON(err);

	/* Body length in bytes, excluding the length word itself. */
	*start = cpu_to_be32((xdr->p - start - 1) * 4);

	dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
		lou.dsu_delta, lou.olu_ioerr_flag);
}
/*
 * Map a pNFS-OSD error code to its OSD_ERR_PRI_* priority.
 * 0 (no error) maps to 0.  An unrecognised code triggers a warning and is
 * ranked like PNFS_OSD_ERR_EIO.
 */
static int
err_prio(u32 oer_errno)
{
	if (!oer_errno)
		return 0;

	switch (oer_errno) {
	case PNFS_OSD_ERR_RESOURCE:
		return OSD_ERR_PRI_RESOURCE;
	case PNFS_OSD_ERR_BAD_CRED:
		return OSD_ERR_PRI_BAD_CRED;
	case PNFS_OSD_ERR_NO_ACCESS:
		return OSD_ERR_PRI_NO_ACCESS;
	case PNFS_OSD_ERR_UNREACHABLE:
		return OSD_ERR_PRI_UNREACHABLE;
	case PNFS_OSD_ERR_NOT_FOUND:
		return OSD_ERR_PRI_NOT_FOUND;
	case PNFS_OSD_ERR_NO_SPACE:
		return OSD_ERR_PRI_NO_SPACE;
	case PNFS_OSD_ERR_EIO:
		break;
	default:
		WARN_ON(1);
		break;
	}
	return OSD_ERR_PRI_EIO;
}
/*
 * Fold @src_err into the accumulated error @dest_err.
 *
 * - An empty accumulator (oer_errno == 0) becomes a copy of @src_err with
 *   the device id blanked.
 * - Partition and object ids are kept only while all merged errors agree;
 *   otherwise they are zeroed.
 * - The component range becomes the union of both ranges.
 * - A write error takes over from a read error.  Between two errors of the
 *   same direction the one with the higher err_prio() wins.
 */
static void
merge_ioerr(struct pnfs_osd_ioerr *dest_err,
	    const struct pnfs_osd_ioerr *src_err)
{
	u64 dest_end, src_end;

	if (!dest_err->oer_errno) {
		*dest_err = *src_err;
		/* accumulated device must be blank */
		memset(&dest_err->oer_component.oid_device_id, 0,
		       sizeof(dest_err->oer_component.oid_device_id));
		return;
	}

	if (dest_err->oer_component.oid_partition_id !=
	    src_err->oer_component.oid_partition_id)
		dest_err->oer_component.oid_partition_id = 0;

	if (dest_err->oer_component.oid_object_id !=
	    src_err->oer_component.oid_object_id)
		dest_err->oer_component.oid_object_id = 0;

	/*
	 * Compute both range ends BEFORE lowering the start offset.
	 * Otherwise the old length is applied to the new, smaller offset
	 * and the tail of the accumulated range is lost.
	 */
	dest_end = end_offset(dest_err->oer_comp_offset,
			      dest_err->oer_comp_length);
	src_end = end_offset(src_err->oer_comp_offset,
			     src_err->oer_comp_length);
	if (dest_end < src_end)
		dest_end = src_end;

	if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
		dest_err->oer_comp_offset = src_err->oer_comp_offset;

	dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;

	if (src_err->oer_iswrite == dest_err->oer_iswrite) {
		/* Same direction: only a higher priority replaces the errno. */
		if (err_prio(src_err->oer_errno) >
		    err_prio(dest_err->oer_errno))
			dest_err->oer_errno = src_err->oer_errno;
	} else if (src_err->oer_iswrite) {
		/* A write error supersedes an accumulated read error. */
		dest_err->oer_iswrite = true;
		dest_err->oer_errno = src_err->oer_errno;
	}
}
/*
 * Collapse every error still queued on @objlay->err_list into a single
 * pnfs_osd_ioerr and encode it at @p.
 *
 * Each component error is logged and merged with merge_ioerr(); every
 * record is then unlinked and freed, leaving the list empty.  The only
 * caller in this file, objlayout_encode_layoutreturn(), invokes this with
 * objlay->lock held.
 */
static void
encode_accumulated_error(struct objlayout *objlay, __be32 *p)
{
	struct pnfs_osd_ioerr summary = {.oer_errno = 0};
	struct objlayout_io_res *rec, *next;

	list_for_each_entry_safe(rec, next, &objlay->err_list, err_list) {
		unsigned comp;

		for (comp = 0; comp < rec->num_comps; comp++) {
			struct pnfs_osd_ioerr *e = &rec->ioerrs[comp];

			if (e->oer_errno) {
				printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
					"dev(%llx:%llx) par=0x%llx obj=0x%llx "
					"offset=0x%llx length=0x%llx\n",
					__func__, comp, e->oer_errno,
					e->oer_iswrite,
					_DEVID_LO(&e->oer_component.oid_device_id),
					_DEVID_HI(&e->oer_component.oid_device_id),
					e->oer_component.oid_partition_id,
					e->oer_component.oid_object_id,
					e->oer_comp_offset,
					e->oer_comp_length);

				merge_ioerr(&summary, e);
			}
		}

		list_del(&rec->err_list);
		objio_free_result(rec);
	}

	pnfs_osd_xdr_encode_ioerr(p, &summary);
}
void
objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args)
{
struct objlayout *objlay = OBJLAYOUT(pnfslay);
struct objlayout_io_res *oir, *tmp;
__be32 *start;
dprintk("%s: Begin\n", __func__);
start = xdr_reserve_space(xdr, 4);
BUG_ON(!start);
spin_lock(&objlay->lock);
list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
__be32 *last_xdr = NULL, *p;
unsigned i;
int res = 0;
for (i = 0; i < oir->num_comps; i++) {
struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
if (!ioerr->oer_errno)
continue;
dprintk("%s: err[%d]: errno=%d is_write=%d "
"dev(%llx:%llx) par=0x%llx obj=0x%llx "
"offset=0x%llx length=0x%llx\n",
__func__, i, ioerr->oer_errno,
ioerr->oer_iswrite,
_DEVID_LO(&ioerr->oer_component.oid_device_id),
_DEVID_HI(&ioerr->oer_component.oid_device_id),
ioerr->oer_component.oid_partition_id,
ioerr->oer_component.oid_object_id,
ioerr->oer_comp_offset,
ioerr->oer_comp_length);
p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
if (unlikely(!p)) {
res = -E2BIG;
break; /* accumulated_error */
}
last_xdr = p;
pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
}
/* TODO: use xdr_write_pages */
if (unlikely(res)) {
/* no space for even one error descriptor */
BUG_ON(!last_xdr);
/* we've encountered a situation with lots and lots of
* errors and no space to encode them all. Use the last
* available slot to report the union of all the
* remaining errors.
*/
encode_accumulated_error(objlay, last_xdr);
goto loop_done;
}
list_del(&oir->err_list);
objio_free_result(oir);
}
loop_done:
spin_unlock(&objlay->lock);
*start = cpu_to_be32((xdr->p - start - 1) * 4);
dprintk("%s: Return\n", __func__);
}
/*
* Get Device Info API for io engines
*/
/*
 * Pairs the device address handed out by objlayout_get_deviceinfo() with
 * the page that received the GETDEVICEINFO reply.
 * objlayout_put_deviceinfo() recovers this structure from the address of
 * 'da' via container_of() and frees both the page and the structure.
 */
struct objlayout_deviceinfo {
/* Reply page; released with __free_page() in objlayout_put_deviceinfo() */
struct page *page;
struct pnfs_osd_deviceaddr da; /* This must be last */
};
/*
 * Fetch and decode the device address for @d_id.
 *
 * @pnfslay:    layout whose inode selects the server to query
 * @d_id:       device id to look up
 * @deviceaddr: out, set to the decoded device address on success
 * @gfp_flags:  allocation flags for the reply page and the bookkeeping
 *              structure
 *
 * Issues nfs4_proc_getdeviceinfo() with a single reply page and decodes
 * the reply into a newly allocated objlayout_deviceinfo.  The reply page is
 * stored in that structure and is only released by
 * objlayout_put_deviceinfo(), which must eventually be called on
 * *@deviceaddr (presumably because the decoded address refers to data in
 * the page - confirm against pnfs_osd_xdr_decode_deviceaddr()).
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error from
 * nfs4_proc_getdeviceinfo(); on failure nothing is left allocated.
 */
int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
	gfp_t gfp_flags)
{
	struct objlayout_deviceinfo *odi;
	struct page *page;
	/*
	 * The designated initializer zeroes every field not listed, so no
	 * member of the request reaches the RPC layer uninitialised.
	 */
	struct pnfs_device pd = {
		.layout_type = LAYOUT_OSD2_OBJECTS,
		.pages = &page,
		.pgbase = 0,
		.pglen = PAGE_SIZE,
		.mincount = 0,
	};
	int err;

	page = alloc_page(gfp_flags);
	if (!page)
		return -ENOMEM;

	memcpy(&pd.dev_id, d_id, sizeof(*d_id));

	err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
	if (err)
		goto err_out;

	odi = kzalloc(sizeof(*odi), gfp_flags);
	if (!odi) {
		err = -ENOMEM;
		goto err_out;
	}

	pnfs_osd_xdr_decode_deviceaddr(&odi->da, page_address(page));
	/* Ownership of the page moves to odi; freed in put_deviceinfo. */
	odi->page = page;
	*deviceaddr = &odi->da;
	return 0;

err_out:
	__free_page(page);
	return err;
}
/*
 * Release a device address: free the page stored alongside it and the
 * enclosing objlayout_deviceinfo, which is recovered from @deviceaddr
 * with container_of().
 */
void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
{
	struct objlayout_deviceinfo *info;

	info = container_of(deviceaddr, struct objlayout_deviceinfo, da);

	__free_page(info->page);
	kfree(info);
}