kernel/pnfs-all-2.6.38-2011-03-25....

27008 lines
773 KiB
Diff

diff -up linux-2.6.38.noarch/Documentation/filesystems/Locking.orig linux-2.6.38.noarch/Documentation/filesystems/Locking
--- linux-2.6.38.noarch/Documentation/filesystems/Locking.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/Documentation/filesystems/Locking 2011-03-26 07:57:44.217821899 -0400
@@ -21,6 +21,7 @@ prototypes:
char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen);
struct vfsmount *(*d_automount)(struct path *path);
int (*d_manage)(struct dentry *, bool);
+ void (*d_unlink)(struct dentry *, struct dentry *);
locking rules:
rename_lock ->d_lock may block rcu-walk
@@ -33,6 +34,7 @@ d_iput: no no yes no
d_dname: no no no no
d_automount: no no yes no
d_manage: no no yes (ref-walk) maybe
+d_unlink:	no		no		yes		no
--------------------------- inode_operations ---------------------------
prototypes:
diff -up linux-2.6.38.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.38.noarch/Documentation/filesystems/spnfs.txt
--- linux-2.6.38.noarch/Documentation/filesystems/spnfs.txt.orig 2011-03-26 07:57:44.217821899 -0400
+++ linux-2.6.38.noarch/Documentation/filesystems/spnfs.txt 2011-03-26 07:57:44.218821875 -0400
@@ -0,0 +1,211 @@
+(c) 2007 Network Appliance Inc.
+
+spNFS
+-----
+
+An spNFS system consists of a Meta Data Server (MDS), a number of Client machines (C) and a number of Data Servers (DS).
+
+A file system is mounted by the clients from the MDS, and all file data
+is striped across the DSs.
+
+Identify the machines that will be filling each of these roles.
+
+The spnfs kernel will be installed on all machines: clients, the MDS and DSs.
+
+
+Building and installing the spNFS kernel
+----------------------------------------
+
+Get the spNFS kernel from:
+
+ git://linux-nfs.org/~bhalevy/linux-pnfs.git
+
+Use the pnfs-all-latest branch and add these options to your .config file
+
+ CONFIG_NETWORK_FILESYSTEMS=y
+ CONFIG_NFS_FS=m
+ CONFIG_NFS_V4=y
+ CONFIG_NFS_V4_1=y
+ CONFIG_PNFS=y
+ CONFIG_NFSD=m
+ CONFIG_PNFSD=y
+ # CONFIG_PNFSD_LOCAL_EXPORT is not set
+ CONFIG_SPNFS=y
+
+By default, spNFS uses whole-file layouts. Layout segments can be enabled
+by adding:
+
+ CONFIG_SPNFS_LAYOUTSEGMENTS=y
+
+to your .config file.
+
+Building and installation of kernel+modules is as usual.
+This kernel should be installed and booted on the client, MDS and DSs.
+
+Note that CONFIG_PNFSD_LOCAL_EXPORT must be disabled for spnfs as it
+takes over the pnfs export interface.
+
+Building nfs-utils
+------------------
+
+Get the nfs-utils package containing spnfsd from:
+
+ git://linux-nfs.org/~bhalevy/pnfs-nfs-utils.git
+
+Follow the standard instructions for building nfs-utils.
+
+After building, the spnfsd daemon will be located in utils/spnfsd. The spnfsd
+daemon will only be needed on the MDS.
+
+
+Installation
+------------
+
+The nfs-utils package contains a default spnfsd.conf file in
+utils/spnfsd/spnfsd.conf. Copy this file to /etc/spnfsd.conf.
+
+By default, the DS-Mount-Directory is set to /spnfs (see spnfsd.conf). Under
+this directory, mount points must be created for each DS to
+be used for pNFS data stripes. These mount points are named by the IP address
+of the corresponding DS. In the sample spnfsd.conf, there are two
+DSs defined (172.16.28.134 and 172.16.28.141).
+
+Following the sample spnfsd.conf,
+
+ mkdir /spnfs
+
+on the MDS (corresponding to DS-Mount-Directory). Then
+
+ mkdir /spnfs/172.16.28.134
+ mkdir /spnfs/172.16.28.141
+
+to create the mount points for the DSs.
+
+On the DSs, choose a directory where data stripes will be created by the MDS.
+For the sample file, this directory is /pnfs, so on each DS execute:
+
+ mkdir /pnfs
+
+This directory is specified in the spnfsd.conf file by the DS*_ROOT option
+(where * is replaced by the DS number). DS_ROOT is specified relative to
+the directory being exported by the DSs. In our example, our DSs are exporting
+the root directory (/) and therefore our DS_ROOT is /pnfs. On the DSs, we have
+the following entry in /etc/exports:
+
+ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check)
+
+N.B. If we had created a /exports directory and a /pnfs directory under
+/exports, and if we were exporting /exports, then DS_ROOT would still be /pnfs
+(not /exports/pnfs).
+
+It may be useful to add entries to /etc/fstab on the MDS to automatically
+mount the DS_ROOT file systems. For this example, our MDS fstab would
+contain:
+
+	172.16.28.134:/pnfs	/spnfs/172.16.28.134	nfs	defaults	1 2
+	172.16.28.141:/pnfs	/spnfs/172.16.28.141	nfs	defaults	1 2
+
+The DS mounts must be performed manually or via fstab at this time (automatic
+mounting, directory creation, etc. are on the todo list). To perform I/O
+through the MDS, the DS mounts MUST use NFSv3 at this time (this restriction
+will eventually be removed).
+
+
+On the MDS, choose a file system to use with spNFS and export it, e.g.:
+
+ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check,pnfs)
+
+Make sure nfsd and all supporting processes are running on the MDS and DSs.
+
+
+Running
+-------
+
+If rpc_pipefs is not already mounted (if you're running idmapd it probably is),
+you may want to add the following line to /etc/fstab:
+
+ rpc_pipefs /var/lib/nfs/rpc_pipefs rpc_pipefs defaults 0 0
+
+to automatically mount rpc_pipefs.
+
+With spnfsd.conf configured for your environment and the mounts mounted as
+described above, spnfsd can now be started.
+
+On the MDS, execute spnfsd:
+
+ spnfsd
+
+The executable is located in the directory where it was built, and
+may also have been installed elsewhere depending on how you built nfs-utils.
+It will run in the foreground by default, and in fact will do so despite
+any options suggesting the contrary (it's still a debugging build).
+
+On the client, make sure the nfslayoutdriver module is loaded:
+
+ modprobe nfslayoutdriver
+
+Then mount the file system from the MDS:
+
+ mount -t nfs4 -o minorversion=1 mds:/ /mnt
+
+I/O through the MDS is now supported. To use it, do not load the
+nfslayoutdriver on the client, and mount the MDS using NFSv4 or 4.1
+(NFSv2 and v3 are not yet supported).
+
+You may now use spNFS by performing file system activities in /mnt.
+If you create files in /mnt, you should see stripe files corresponding to
+new files being created on the DSs. The current implementation names the
+stripe files based on the inode number of the file on the MDS. For example,
+if you create a file foo in /mnt and do an 'ls -li /mnt/foo':
+
+ # ls -li foo
+ 1233 -rw-r--r-- 1 root root 0 Nov 29 15:54 foo
+
+You should see stripe files on each DS under /pnfs (per the sample) named
+1233. The file /pnfs/1233 on DS1 will contain the first <stripe size> bytes
+of data written to foo, DS2 will contain the next <stripe size> bytes, etc.
+Removing /mnt/foo will remove the corresponding stripe files on the DSs.
+Other file system operations should behave (mostly :-) as expected.
+
+
+Layout Segments
+---------------
+
+If the kernel is compiled to support layout segments, there will
+be two files created under /proc/fs/spnfs for controlling layout
+segment functionality.
+
+To enable layout segments, write a '1' to /proc/fs/spnfs/layoutseg, e.g.:
+
+ echo 1 > /proc/fs/spnfs/layoutseg
+
+Layout segments can be disabled (returning to whole-file layouts) by
+writing a '0' to /proc/fs/spnfs/layoutseg:
+
+ echo 0 > /proc/fs/spnfs/layoutseg
+
+When layout segments are enabled, the size of the layouts returned can
+be specified by writing a decimal number (ascii representation) to
+/proc/fs/spnfs/layoutsegsize:
+
+ echo 1024 > /proc/fs/spnfs/layoutsegsize
+
+The value '0' has a special meaning--it causes the server to return a
+layout that is exactly the size requested by the client:
+
+ echo 0 > /proc/fs/spnfs/layoutsegsize
+
+
+Troubleshooting
+---------------
+
+If you see data being written to the files on the MDS rather than
+the stripe files, make sure the nfslayoutdriver is loaded on the client
+(see above).
+
+If you get a "permission denied" error, make sure mountd is running on the MDS
+(it occasionally fails to start).
+
+Bugs, enhancements, compliments, complaints to: dmuntz@netapp.com
+
+
diff -up linux-2.6.38.noarch/Documentation/filesystems/vfs.txt.orig linux-2.6.38.noarch/Documentation/filesystems/vfs.txt
--- linux-2.6.38.noarch/Documentation/filesystems/vfs.txt.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/Documentation/filesystems/vfs.txt 2011-03-26 07:57:44.218821875 -0400
@@ -866,6 +866,7 @@ struct dentry_operations {
char *(*d_dname)(struct dentry *, char *, int);
struct vfsmount *(*d_automount)(struct path *);
int (*d_manage)(struct dentry *, bool, bool);
+ void (*d_unlink)(struct dentry *, struct dentry *);
};
d_revalidate: called when the VFS needs to revalidate a dentry. This
@@ -973,6 +974,14 @@ struct dentry_operations {
This function is only used if DCACHE_MANAGE_TRANSIT is set on the
dentry being transited from.
+ d_unlink: called to allow the filesystem to unlink the dentry after final
+ use. It is only called when DCACHE_NFSFS_RENAMED is set, and is
+ designed for use by 'sillyrename' schemes that are commonly
+ implemented on distributed filesystems such as NFS.
+
+ Note that the filesystem is still responsible for protecting against
+ races with other lookups.
+
Example :
static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen)
diff -up linux-2.6.38.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.38.noarch/drivers/md/dm-ioctl.c
--- linux-2.6.38.noarch/drivers/md/dm-ioctl.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/drivers/md/dm-ioctl.c 2011-03-26 07:57:44.220821839 -0400
@@ -713,6 +713,12 @@ static int dev_create(struct dm_ioctl *p
return 0;
}
+int dm_dev_create(struct dm_ioctl *param)
+{
+ return dev_create(param, sizeof(*param));
+}
+EXPORT_SYMBOL(dm_dev_create);
+
/*
* Always use UUID for lookups if it's present, otherwise use name or dev.
*/
@@ -808,6 +814,12 @@ static int dev_remove(struct dm_ioctl *p
return 0;
}
+int dm_dev_remove(struct dm_ioctl *param)
+{
+ return dev_remove(param, sizeof(*param));
+}
+EXPORT_SYMBOL(dm_dev_remove);
+
/*
* Check a string doesn't overrun the chunk of
* memory we copied from userland.
@@ -990,6 +1002,12 @@ static int do_resume(struct dm_ioctl *pa
return r;
}
+int dm_do_resume(struct dm_ioctl *param)
+{
+ return do_resume(param);
+}
+EXPORT_SYMBOL(dm_do_resume);
+
/*
* Set or unset the suspension state of a device.
* If the device already is in the requested state we just return its status.
@@ -1256,6 +1274,12 @@ out:
return r;
}
+int dm_table_load(struct dm_ioctl *param, size_t param_size)
+{
+ return table_load(param, param_size);
+}
+EXPORT_SYMBOL(dm_table_load);
+
static int table_clear(struct dm_ioctl *param, size_t param_size)
{
struct hash_cell *hc;
diff -up linux-2.6.38.noarch/drivers/scsi/hosts.c.orig linux-2.6.38.noarch/drivers/scsi/hosts.c
--- linux-2.6.38.noarch/drivers/scsi/hosts.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/drivers/scsi/hosts.c 2011-03-26 07:57:44.221821816 -0400
@@ -50,10 +50,11 @@ static void scsi_host_cls_release(struct
put_device(&class_to_shost(dev)->shost_gendev);
}
-static struct class shost_class = {
+struct class shost_class = {
.name = "scsi_host",
.dev_release = scsi_host_cls_release,
};
+EXPORT_SYMBOL(shost_class);
/**
* scsi_host_set_state - Take the given host through the host state model.
diff -up linux-2.6.38.noarch/fs/dcache.c.orig linux-2.6.38.noarch/fs/dcache.c
--- linux-2.6.38.noarch/fs/dcache.c.orig 2011-03-26 07:53:05.384187507 -0400
+++ linux-2.6.38.noarch/fs/dcache.c 2011-03-26 07:57:44.223821768 -0400
@@ -305,6 +305,9 @@ static struct dentry *d_kill(struct dent
if (parent)
spin_unlock(&parent->d_lock);
dentry_iput(dentry);
+
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+ dentry->d_op->d_unlink(parent, dentry);
/*
* dentry_iput drops the locks, at which point nobody (except
* transient RCU lookups) can reach this dentry.
@@ -2075,6 +2078,8 @@ again:
dentry->d_flags &= ~DCACHE_CANT_MOUNT;
dentry_unlink_inode(dentry);
fsnotify_nameremove(dentry, isdir);
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+ dentry->d_op->d_unlink(dentry->d_parent, dentry);
return;
}
diff -up linux-2.6.38.noarch/fs/exofs/exofs.h.orig linux-2.6.38.noarch/fs/exofs/exofs.h
--- linux-2.6.38.noarch/fs/exofs/exofs.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/exofs/exofs.h 2011-03-26 07:57:44.225821735 -0400
@@ -36,13 +36,9 @@
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
+#include <linux/pnfs_osd_xdr.h>
#include "common.h"
-/* FIXME: Remove once pnfs hits mainline
- * #include <linux/exportfs/pnfs_osd_xdr.h>
- */
-#include "pnfs.h"
-
#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
#ifdef CONFIG_EXOFS_DEBUG
@@ -103,6 +99,7 @@ struct exofs_sb_info {
struct exofs_i_info {
struct inode vfs_inode; /* normal in-memory inode */
wait_queue_head_t i_wq; /* wait queue for inode */
+ spinlock_t i_layout_lock; /* lock for layout/return/recall */
unsigned long i_flags; /* various atomic flags */
uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
uint32_t i_dir_start_lookup; /* which page to start lookup */
@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si
*/
#define OBJ_2BCREATED 0 /* object will be created soon*/
#define OBJ_CREATED 1 /* object has been created on the osd*/
+/* Below are not used atomic but reuse the same i_flags */
+#define OBJ_LAYOUT_IS_GIVEN 2 /* inode has given layouts to clients*/
+#define OBJ_IN_LAYOUT_RECALL 3 /* inode is in the middle of a layout recall*/
static inline int obj_2bcreated(struct exofs_i_info *oi)
{
@@ -303,4 +303,21 @@ extern const struct inode_operations exo
extern const struct inode_operations exofs_symlink_inode_operations;
extern const struct inode_operations exofs_fast_symlink_inode_operations;
+/* export.c */
+typedef int (exofs_recall_fn)(struct inode *inode, u64 data);
+#ifdef CONFIG_PNFSD
+int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode,
+ exofs_recall_fn todo, u64 todo_data);
+void exofs_init_export(struct super_block *sb);
+#else
+static inline int
+exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode,
+exofs_recall_fn todo, u64 todo_data)
+{
+ return todo(inode, todo_data);
+}
+
+static inline void exofs_init_export(struct super_block *sb) {}
+#endif
+
#endif
diff -up linux-2.6.38.noarch/fs/exofs/export.c.orig linux-2.6.38.noarch/fs/exofs/export.c
--- linux-2.6.38.noarch/fs/exofs/export.c.orig 2011-03-26 07:57:44.226821719 -0400
+++ linux-2.6.38.noarch/fs/exofs/export.c 2011-03-26 07:57:44.226821719 -0400
@@ -0,0 +1,396 @@
+/*
+ * export.c - Implementation of the pnfs_export_operations
+ *
+ * Copyright (C) 2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/nfsd/nfsd4_pnfs.h>
+#include "exofs.h"
+
+static int exofs_layout_type(struct super_block *sb)
+{
+ return LAYOUT_OSD2_OBJECTS;
+}
+
+static void set_dev_id(struct nfs4_deviceid *pnfs_devid, u64 sbid, u64 devid)
+{
+ struct nfsd4_pnfs_deviceid *dev_id =
+ (struct nfsd4_pnfs_deviceid *)pnfs_devid;
+
+ dev_id->sbid = sbid;
+ dev_id->devid = devid;
+}
+
+static int cb_layout_recall(struct inode *inode, enum pnfs_iomode iomode,
+ u64 offset, u64 length, void *cookie)
+{
+ struct nfsd4_pnfs_cb_layout cbl;
+ struct pnfsd_cb_ctl cb_ctl;
+ int status;
+
+ memset(&cb_ctl, 0, sizeof(cb_ctl));
+ status = pnfsd_get_cb_op(&cb_ctl);
+ if (unlikely(status)) {
+ EXOFS_ERR("%s: nfsd unloaded!! inode (0x%lx) status=%d\n",
+ __func__, inode->i_ino, status);
+ goto err;
+ }
+
+ memset(&cbl, 0, sizeof(cbl));
+ cbl.cbl_recall_type = RETURN_FILE;
+ cbl.cbl_seg.layout_type = LAYOUT_OSD2_OBJECTS;
+ cbl.cbl_seg.iomode = iomode;
+ cbl.cbl_seg.offset = offset;
+ cbl.cbl_seg.length = length;
+ cbl.cbl_cookie = cookie;
+
+ status = cb_ctl.cb_op->cb_layout_recall(inode->i_sb, inode, &cbl);
+ pnfsd_put_cb_op(&cb_ctl);
+
+err:
+ return status;
+}
+
+static enum nfsstat4 exofs_layout_get(
+ struct inode *inode,
+ struct exp_xdr_stream *xdr,
+ const struct nfsd4_pnfs_layoutget_arg *args,
+ struct nfsd4_pnfs_layoutget_res *res)
+{
+ struct exofs_i_info *oi = exofs_i(inode);
+ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+ struct exofs_layout *el = &sbi->layout;
+ struct pnfs_osd_object_cred *creds = NULL;
+ struct pnfs_osd_layout layout;
+ __be32 *start;
+ bool in_recall;
+ int i, err;
+ enum nfsstat4 nfserr;
+
+ res->lg_seg.offset = 0;
+ res->lg_seg.length = NFS4_MAX_UINT64;
+ res->lg_seg.iomode = IOMODE_RW;
+ res->lg_return_on_close = true; /* TODO: unused but will be soon */
+
+ /* skip opaque size, will be filled-in later */
+ start = exp_xdr_reserve_qwords(xdr, 1);
+ if (!start) {
+ nfserr = NFS4ERR_TOOSMALL;
+ goto out;
+ }
+
+ creds = kcalloc(el->s_numdevs, sizeof(*creds), GFP_KERNEL);
+ if (!creds) {
+ nfserr = NFS4ERR_LAYOUTTRYLATER;
+ goto out;
+ }
+
+ /* Fill in a pnfs_osd_layout struct */
+ layout.olo_map = sbi->data_map;
+
+ for (i = 0; i < el->s_numdevs; i++) {
+ struct pnfs_osd_object_cred *cred = &creds[i];
+ osd_id id = exofs_oi_objno(oi);
+ unsigned dev = exofs_layout_od_id(el, id, i);
+
+ set_dev_id(&cred->oc_object_id.oid_device_id, args->lg_sbid,
+ dev);
+ cred->oc_object_id.oid_partition_id = el->s_pid;
+ cred->oc_object_id.oid_object_id = id;
+ cred->oc_osd_version = osd_dev_is_ver1(el->s_ods[dev]) ?
+ PNFS_OSD_VERSION_1 :
+ PNFS_OSD_VERSION_2;
+ cred->oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE;
+
+ cred->oc_cap_key.cred_len = 0;
+ cred->oc_cap_key.cred = NULL;
+
+ cred->oc_cap.cred_len = OSD_CAP_LEN;
+ cred->oc_cap.cred = oi->i_cred;
+ }
+
+ layout.olo_comps_index = 0;
+ layout.olo_num_comps = el->s_numdevs;
+ layout.olo_comps = creds;
+
+ err = pnfs_osd_xdr_encode_layout(xdr, &layout);
+ if (err) {
+ nfserr = NFS4ERR_TOOSMALL; /* FIXME: Change osd_xdr error codes */
+ goto out;
+ }
+
+ exp_xdr_encode_opaque_len(start, xdr->p);
+
+ spin_lock(&oi->i_layout_lock);
+ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags);
+ if (!in_recall) {
+ __set_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags);
+ nfserr = NFS4_OK;
+ } else {
+ nfserr = NFS4ERR_RECALLCONFLICT;
+ }
+ spin_unlock(&oi->i_layout_lock);
+
+out:
+ kfree(creds);
+ EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n",
+ inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start));
+ return nfserr;
+}
+
+/* NOTE: inode mutex must NOT be held */
+static int exofs_layout_commit(
+ struct inode *inode,
+ const struct nfsd4_pnfs_layoutcommit_arg *args,
+ struct nfsd4_pnfs_layoutcommit_res *res)
+{
+ struct exofs_i_info *oi = exofs_i(inode);
+ struct timespec mtime;
+ loff_t i_size;
+ int in_recall;
+
+ /* In case of a recall we ignore the new size and mtime since they
+ * are going to be changed again by truncate, and since we cannot take
+ * the inode lock in that case.
+ */
+ spin_lock(&oi->i_layout_lock);
+ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags);
+ spin_unlock(&oi->i_layout_lock);
+ if (in_recall) {
+ EXOFS_DBGMSG("(0x%lx) commit was called during recall\n",
+ inode->i_ino);
+ return 0;
+ }
+
+ /* NOTE: I would love to call inode_setattr here
+ * but i cannot since this will cause an eventual vmtruncate,
+ * which will cause a layout_recall. So open code the i_size
+ * and mtime/atime changes under i_mutex.
+ */
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_NORMAL);
+
+ if (args->lc_mtime.seconds) {
+ mtime.tv_sec = args->lc_mtime.seconds;
+ mtime.tv_nsec = args->lc_mtime.nseconds;
+
+ /* layout commit may only make time bigger, since there might
+ * be reordering of the notifications and it might arrive after
+ * A local change.
+ * TODO: if mtime > ctime then we know set_attr did an mtime
+ * in the future. and we can let this update through
+ */
+ if (0 <= timespec_compare(&mtime, &inode->i_mtime))
+ mtime = inode->i_mtime;
+ } else {
+ mtime = current_fs_time(inode->i_sb);
+ }
+
+ /* TODO: Will below work? since mark_inode_dirty has it's own
+ * Time handling
+ */
+ inode->i_atime = inode->i_mtime = mtime;
+
+ i_size = i_size_read(inode);
+ if (args->lc_newoffset) {
+ loff_t new_size = args->lc_last_wr + 1;
+
+ if (i_size < new_size) {
+ i_size_write(inode, i_size = new_size);
+ res->lc_size_chg = 1;
+ res->lc_newsize = new_size;
+ }
+ }
+ /* TODO: else { i_size = osd_get_object_length() } */
+
+/* TODO: exofs does not currently use the osd_xdr part of the layout_commit */
+
+ mark_inode_dirty_sync(inode);
+
+ mutex_unlock(&inode->i_mutex);
+ EXOFS_DBGMSG("(0x%lx) i_size=0x%llx lcp->off=0x%llx\n",
+ inode->i_ino, i_size, args->lc_last_wr);
+ return 0;
+}
+
+static void exofs_handle_error(struct pnfs_osd_ioerr *ioerr)
+{
+ EXOFS_ERR("exofs_handle_error: errno=%d is_write=%d obj=0x%llx "
+ "offset=0x%llx length=0x%llx\n",
+ ioerr->oer_errno, ioerr->oer_iswrite,
+ _LLU(ioerr->oer_component.oid_object_id),
+ _LLU(ioerr->oer_comp_offset),
+ _LLU(ioerr->oer_comp_length));
+}
+
+static int exofs_layout_return(
+ struct inode *inode,
+ const struct nfsd4_pnfs_layoutreturn_arg *args)
+{
+ __be32 *p = args->lrf_body;
+ unsigned len = exp_xdr_qwords(args->lrf_body_len);
+
+ EXOFS_DBGMSG("(0x%lx) cookie %p xdr_len %d\n",
+ inode->i_ino, args->lr_cookie, len);
+
+ while (len >= pnfs_osd_ioerr_xdr_sz()) {
+ struct pnfs_osd_ioerr ioerr;
+
+ p = pnfs_osd_xdr_decode_ioerr(&ioerr, p);
+ len -= pnfs_osd_ioerr_xdr_sz();
+ exofs_handle_error(&ioerr);
+ }
+
+ if (args->lr_cookie) {
+ struct exofs_i_info *oi = exofs_i(inode);
+ bool in_recall;
+
+ spin_lock(&oi->i_layout_lock);
+ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags);
+ __clear_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags);
+ spin_unlock(&oi->i_layout_lock);
+
+ /* TODO: how to communicate cookie with the waiter */
+ if (in_recall)
+ wake_up(&oi->i_wq); /* wakeup any recalls */
+ }
+
+ return 0;
+}
+
+int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr,
+ u32 layout_type,
+ const struct nfsd4_pnfs_deviceid *devid)
+{
+ struct exofs_sb_info *sbi = sb->s_fs_info;
+ struct pnfs_osd_deviceaddr devaddr;
+ const struct osd_dev_info *odi;
+ u64 devno = devid->devid;
+ __be32 *start;
+ int err;
+
+ memset(&devaddr, 0, sizeof(devaddr));
+
+ if (unlikely(devno >= sbi->layout.s_numdevs))
+ return -ENODEV;
+
+ odi = osduld_device_info(sbi->layout.s_ods[devno]);
+
+ devaddr.oda_systemid.len = odi->systemid_len;
+ devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */
+
+ devaddr.oda_osdname.len = odi->osdname_len ;
+ devaddr.oda_osdname.data = (void *)odi->osdname;/* !const cast */
+
+ /* skip opaque size, will be filled-in later */
+ start = exp_xdr_reserve_qwords(xdr, 1);
+ if (!start) {
+ err = -E2BIG;
+ goto err;
+ }
+
+ err = pnfs_osd_xdr_encode_deviceaddr(xdr, &devaddr);
+ if (err)
+ goto err;
+
+ exp_xdr_encode_opaque_len(start, xdr->p);
+
+ EXOFS_DBGMSG("xdr_bytes=%Zu devno=%lld osdname-%s\n",
+ exp_xdr_qbytes(xdr->p - start), devno, odi->osdname);
+ return 0;
+
+err:
+ EXOFS_DBGMSG("Error: err=%d at_byte=%zu\n",
+ err, exp_xdr_qbytes(xdr->p - start));
+ return err;
+}
+
+struct pnfs_export_operations exofs_pnfs_ops = {
+ .layout_type = exofs_layout_type,
+ .layout_get = exofs_layout_get,
+ .layout_commit = exofs_layout_commit,
+ .layout_return = exofs_layout_return,
+ .get_device_info = exofs_get_device_info,
+};
+
+static bool is_layout_returned(struct exofs_i_info *oi)
+{
+ bool layout_given;
+
+ spin_lock(&oi->i_layout_lock);
+ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags);
+ spin_unlock(&oi->i_layout_lock);
+
+ return !layout_given;
+}
+
+int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode,
+ exofs_recall_fn todo, u64 todo_data)
+{
+ struct exofs_i_info *oi = exofs_i(inode);
+ int layout_given;
+ int error = 0;
+
+ spin_lock(&oi->i_layout_lock);
+ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags);
+ __set_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags);
+ spin_unlock(&oi->i_layout_lock);
+
+ if (!layout_given)
+ goto exec;
+
+ for (;;) {
+ EXOFS_DBGMSG("(0x%lx) has_layout issue a recall\n",
+ inode->i_ino);
+ error = cb_layout_recall(inode, iomode, 0, NFS4_MAX_UINT64,
+ &oi->i_wq);
+ switch (error) {
+ case 0:
+ case -EAGAIN:
+ break;
+ case -ENOENT:
+ goto exec;
+ default:
+ goto err;
+ }
+
+ error = wait_event_interruptible(oi->i_wq,
+ is_layout_returned(oi));
+ if (error)
+ goto err;
+ }
+
+exec:
+ error = todo(inode, todo_data);
+
+err:
+ spin_lock(&oi->i_layout_lock);
+ __clear_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags);
+ spin_unlock(&oi->i_layout_lock);
+ EXOFS_DBGMSG("(0x%lx) return=>%d\n", inode->i_ino, error);
+ return error;
+}
+
+void exofs_init_export(struct super_block *sb)
+{
+ sb->s_pnfs_op = &exofs_pnfs_ops;
+}
diff -up linux-2.6.38.noarch/fs/exofs/inode.c.orig linux-2.6.38.noarch/fs/exofs/inode.c
--- linux-2.6.38.noarch/fs/exofs/inode.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/exofs/inode.c 2011-03-26 07:57:44.227821703 -0400
@@ -820,8 +820,9 @@ static inline int exofs_inode_is_fast_sy
const struct osd_attr g_attr_logical_length = ATTR_DEF(
OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
-static int _do_truncate(struct inode *inode, loff_t newsize)
+static int _do_truncate(struct inode *inode, u64 data)
{
+ loff_t newsize = data;
struct exofs_i_info *oi = exofs_i(inode);
int ret;
@@ -858,7 +859,8 @@ int exofs_setattr(struct dentry *dentry,
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
- error = _do_truncate(inode, iattr->ia_size);
+ error = exofs_inode_recall_layout(inode, IOMODE_ANY,
+ _do_truncate, iattr->ia_size);
if (unlikely(error))
return error;
}
@@ -971,6 +973,7 @@ static void __oi_init(struct exofs_i_inf
{
init_waitqueue_head(&oi->i_wq);
oi->i_flags = 0;
+ spin_lock_init(&oi->i_layout_lock);
}
/*
* Fill in an inode read from the OSD and set it up for use
diff -up linux-2.6.38.noarch/fs/exofs/Kbuild.orig linux-2.6.38.noarch/fs/exofs/Kbuild
--- linux-2.6.38.noarch/fs/exofs/Kbuild.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/exofs/Kbuild 2011-03-26 07:57:44.224821753 -0400
@@ -13,4 +13,5 @@
#
exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
+exofs-$(CONFIG_PNFSD) += export.o
obj-$(CONFIG_EXOFS_FS) += exofs.o
diff -up linux-2.6.38.noarch/fs/exofs/Kconfig.orig linux-2.6.38.noarch/fs/exofs/Kconfig
--- linux-2.6.38.noarch/fs/exofs/Kconfig.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/exofs/Kconfig 2011-03-26 07:57:44.224821753 -0400
@@ -1,6 +1,7 @@
config EXOFS_FS
tristate "exofs: OSD based file system support"
depends on SCSI_OSD_ULD
+ select EXPORTFS_OSD_LAYOUT if PNFSD
help
EXOFS is a file system that uses an OSD storage device,
as its backing storage.
diff -up linux-2.6.38.noarch/fs/exofs/super.c.orig linux-2.6.38.noarch/fs/exofs/super.c
--- linux-2.6.38.noarch/fs/exofs/super.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/exofs/super.c 2011-03-26 07:57:44.229821686 -0400
@@ -627,6 +627,7 @@ static int exofs_fill_super(struct super
sb->s_fs_info = sbi;
sb->s_op = &exofs_sops;
sb->s_export_op = &exofs_export_ops;
+ exofs_init_export(sb);
root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
if (IS_ERR(root)) {
EXOFS_ERR("ERROR: exofs_iget failed\n");
diff -up linux-2.6.38.noarch/fs/exportfs/expfs.c.orig linux-2.6.38.noarch/fs/exportfs/expfs.c
--- linux-2.6.38.noarch/fs/exportfs/expfs.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/exportfs/expfs.c 2011-03-26 07:57:44.230821684 -0400
@@ -16,6 +16,13 @@
#include <linux/namei.h>
#include <linux/sched.h>
+#if defined(CONFIG_PNFSD)
+struct pnfsd_cb_ctl pnfsd_cb_ctl = {
+ .lock = __SPIN_LOCK_UNLOCKED(pnfsd_cb_ctl.lock)
+};
+EXPORT_SYMBOL(pnfsd_cb_ctl);
+#endif /* CONFIG_PNFSD */
+
#define dprintk(fmt, args...) do{}while(0)
diff -up linux-2.6.38.noarch/fs/exportfs/Makefile.orig linux-2.6.38.noarch/fs/exportfs/Makefile
--- linux-2.6.38.noarch/fs/exportfs/Makefile.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/exportfs/Makefile 2011-03-26 07:57:44.229821686 -0400
@@ -3,4 +3,7 @@
obj-$(CONFIG_EXPORTFS) += exportfs.o
-exportfs-objs := expfs.o
+exportfs-y := expfs.o
+exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o
+exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o
+exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o
diff -up linux-2.6.38.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.38.noarch/fs/exportfs/nfs4blocklayoutxdr.c
--- linux-2.6.38.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2011-03-26 07:57:44.230821684 -0400
+++ linux-2.6.38.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2011-03-26 07:57:44.230821684 -0400
@@ -0,0 +1,158 @@
+/*
+ * linux/fs/nfsd/nfs4blocklayoutxdr.c
+ *
+ *
+ * Created by Rick McNeal on 3/31/08.
+ * Copyright 2008 __MyCompanyName__. All rights reserved.
+ *
+ */
+#include <linux/module.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/nfs4.h>
+#include <linux/nfsd/nfs4layoutxdr.h>
+
+static int
+bl_encode_simple(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld)
+{
+ __be32 *p = exp_xdr_reserve_space(xdr,
+ 12 + 4 + bld->u.simple.bld_sig_len);
+
+ if (!p)
+ return -ETOOSMALL;
+
+ p = exp_xdr_encode_u32(p, 1);
+ p = exp_xdr_encode_u64(p, bld->u.simple.bld_offset);
+ exp_xdr_encode_opaque(p, bld->u.simple.bld_sig,
+ bld->u.simple.bld_sig_len);
+
+ return 0;
+}
+
+static int
+bl_encode_slice(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld)
+{
+ __be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 2 + 1);
+
+ if (!p)
+ return -ETOOSMALL;
+
+ p = exp_xdr_encode_u64(p, bld->u.slice.bld_start);
+ p = exp_xdr_encode_u64(p, bld->u.slice.bld_len);
+ exp_xdr_encode_u32(p, bld->u.slice.bld_index);
+
+ return 0;
+}
+
+static int
+bl_encode_concat(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld)
+{
+ return -ENOTSUPP;
+}
+
+static int
+bl_encode_stripe(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld)
+{
+ int i;
+ __be32 *p = exp_xdr_reserve_space(xdr,
+ 2 + 1 + bld->u.stripe.bld_stripes);
+
+ p = exp_xdr_encode_u64(p, bld->u.stripe.bld_chunk_size);
+ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripes);
+ for (i = 0; i < bld->u.stripe.bld_stripes; i++)
+ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripe_indexs[i]);
+
+ return 0;
+}
+
+int
+blocklayout_encode_devinfo(struct exp_xdr_stream *xdr,
+ const struct list_head *volumes)
+{
+ u32 num_vols = 0,
+ *layoutlen_p = xdr->p;
+ pnfs_blocklayout_devinfo_t *bld;
+ int status = 0;
+ __be32 *p;
+
+ p = exp_xdr_reserve_qwords(xdr, 2);
+ if (!p)
+ return -ETOOSMALL;
+ p += 2;
+
+ /*
+ * All simple volumes with their signature are required to be listed
+ * first.
+ */
+ list_for_each_entry(bld, volumes, bld_list) {
+ num_vols++;
+ p = exp_xdr_reserve_qwords(xdr, 1);
+ if (!p)
+ return -ETOOSMALL;
+ p = exp_xdr_encode_u32(p, bld->bld_type);
+ switch (bld->bld_type) {
+ case PNFS_BLOCK_VOLUME_SIMPLE:
+ status = bl_encode_simple(xdr, bld);
+ break;
+ case PNFS_BLOCK_VOLUME_SLICE:
+ status = bl_encode_slice(xdr, bld);
+ break;
+ case PNFS_BLOCK_VOLUME_CONCAT:
+ status = bl_encode_concat(xdr, bld);
+ break;
+ case PNFS_BLOCK_VOLUME_STRIPE:
+ status = bl_encode_stripe(xdr, bld);
+ break;
+ default:
+ BUG();
+ }
+ if (status)
+ goto error;
+ }
+
+ /* ---- Fill in the overall length and number of volumes ---- */
+ p = exp_xdr_encode_u32(layoutlen_p, (xdr->p - layoutlen_p - 1) * 4);
+ exp_xdr_encode_u32(p, num_vols);
+
+error:
+ return status;
+}
+EXPORT_SYMBOL_GPL(blocklayout_encode_devinfo);
+
+enum nfsstat4
+blocklayout_encode_layout(struct exp_xdr_stream *xdr,
+ const struct list_head *bl_head)
+{
+ struct pnfs_blocklayout_layout *b;
+ u32 *layoutlen_p = xdr->p,
+ extents = 0;
+ __be32 *p;
+
+ /*
+ * Save spot for opaque block layout length and number of extents,
+ * fill-in later.
+ */
+ p = exp_xdr_reserve_qwords(xdr, 2);
+ if (!p)
+ return NFS4ERR_TOOSMALL;
+ p += 2;
+
+ list_for_each_entry(b, bl_head, bll_list) {
+ extents++;
+ p = exp_xdr_reserve_qwords(xdr, 5 * 2 + 1);
+ if (!p)
+ return NFS4ERR_TOOSMALL;
+ p = exp_xdr_encode_u64(p, b->bll_vol_id.sbid);
+ p = exp_xdr_encode_u64(p, b->bll_vol_id.devid);
+ p = exp_xdr_encode_u64(p, b->bll_foff);
+ p = exp_xdr_encode_u64(p, b->bll_len);
+ p = exp_xdr_encode_u64(p, b->bll_soff);
+ p = exp_xdr_encode_u32(p, b->bll_es);
+ }
+
+ /* ---- Fill in the overall length and number of extents ---- */
+ p = exp_xdr_encode_u32(layoutlen_p, (p - layoutlen_p - 1) * 4);
+ exp_xdr_encode_u32(p, extents);
+
+ return NFS4_OK;
+}
+EXPORT_SYMBOL_GPL(blocklayout_encode_layout);
diff -up linux-2.6.38.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.38.noarch/fs/exportfs/nfs4filelayoutxdr.c
--- linux-2.6.38.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2011-03-26 07:57:44.231821680 -0400
+++ linux-2.6.38.noarch/fs/exportfs/nfs4filelayoutxdr.c 2011-03-26 07:57:44.231821680 -0400
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/exp_xdr.h>
+#include <linux/module.h>
+#include <linux/nfs4.h>
+#include <linux/nfsd/nfsfh.h>
+#include <linux/nfsd/nfs4layoutxdr.h>
+
+/* We do our-own dprintk so filesystems are not dependent on sunrpc */
+#ifdef dprintk
+#undef dprintk
+#endif
+#define dprintk(fmt, ...) do { } while (0)
+
+/* Calculate the XDR length of the GETDEVICEINFO4resok structure
+ * excluding the gdir_notification and the gdir_device_addr da_layout_type.
+ */
+static int fl_devinfo_xdr_words(const struct pnfs_filelayout_device *fdev)
+{
+ struct pnfs_filelayout_devaddr *fl_addr;
+ struct pnfs_filelayout_multipath *mp;
+ int i, j, nwords;
+
+	/* da_addr_body length, indices length, indices,
+	 * multipath_list4 length */
+ nwords = 1 + 1 + fdev->fl_stripeindices_length + 1;
+ for (i = 0; i < fdev->fl_device_length; i++) {
+ mp = &fdev->fl_device_list[i];
+ nwords++; /* multipath list length */
+ for (j = 0; j < mp->fl_multipath_length; j++) {
+			fl_addr = &mp->fl_multipath_list[j];
+ nwords += 1 + exp_xdr_qwords(fl_addr->r_netid.len);
+ nwords += 1 + exp_xdr_qwords(fl_addr->r_addr.len);
+ }
+ }
+ dprintk("<-- %s nwords %d\n", __func__, nwords);
+ return nwords;
+}
+
+/* Encodes the nfsv4_1_file_layout_ds_addr4 structure from draft 13
+ * on the response stream.
+ * Use linux error codes (not nfs) since these values are being
+ * returned to the file system.
+ */
+int
+filelayout_encode_devinfo(struct exp_xdr_stream *xdr,
+ const struct pnfs_filelayout_device *fdev)
+{
+ unsigned int i, j, len = 0, opaque_words;
+ u32 *p_in;
+ u32 index_count = fdev->fl_stripeindices_length;
+ u32 dev_count = fdev->fl_device_length;
+ int error = 0;
+ __be32 *p;
+
+ opaque_words = fl_devinfo_xdr_words(fdev);
+ dprintk("%s: Begin indx_cnt: %u dev_cnt: %u total size %u\n",
+ __func__,
+ index_count,
+ dev_count,
+ opaque_words*4);
+
+ /* check space for opaque length */
+ p = p_in = exp_xdr_reserve_qwords(xdr, opaque_words);
+ if (!p) {
+ error = -ETOOSMALL;
+ goto out;
+ }
+
+ /* Fill in length later */
+ p++;
+
+ /* encode device list indices */
+ p = exp_xdr_encode_u32(p, index_count);
+ for (i = 0; i < index_count; i++)
+ p = exp_xdr_encode_u32(p, fdev->fl_stripeindices_list[i]);
+
+ /* encode device list */
+ p = exp_xdr_encode_u32(p, dev_count);
+ for (i = 0; i < dev_count; i++) {
+ struct pnfs_filelayout_multipath *mp = &fdev->fl_device_list[i];
+
+ p = exp_xdr_encode_u32(p, mp->fl_multipath_length);
+ for (j = 0; j < mp->fl_multipath_length; j++) {
+ struct pnfs_filelayout_devaddr *da =
+ &mp->fl_multipath_list[j];
+
+ /* Encode device info */
+ p = exp_xdr_encode_opaque(p, da->r_netid.data,
+ da->r_netid.len);
+ p = exp_xdr_encode_opaque(p, da->r_addr.data,
+ da->r_addr.len);
+ }
+ }
+
+ /* backfill in length. Subtract 4 for da_addr_body size */
+ len = (char *)p - (char *)p_in;
+ exp_xdr_encode_u32(p_in, len - 4);
+
+ error = 0;
+out:
+ dprintk("%s: End err %d xdrlen %d\n",
+ __func__, error, len);
+ return error;
+}
+EXPORT_SYMBOL(filelayout_encode_devinfo);
+
+/* Encodes the loc_body structure from draft 13
+ * on the response stream.
+ * Use linux error codes (not nfs) since these values are being
+ * returned to the file system.
+ */
+enum nfsstat4
+filelayout_encode_layout(struct exp_xdr_stream *xdr,
+ const struct pnfs_filelayout_layout *flp)
+{
+ u32 len = 0, nfl_util, fhlen, i;
+ u32 *layoutlen_p;
+ enum nfsstat4 nfserr;
+ __be32 *p;
+
+ dprintk("%s: device_id %llx:%llx fsi %u, numfh %u\n",
+ __func__,
+ flp->device_id.pnfs_fsid,
+ flp->device_id.pnfs_devid,
+ flp->lg_first_stripe_index,
+ flp->lg_fh_length);
+
+ /* Ensure file system added at least one file handle */
+	if (flp->lg_fh_length == 0) {
+ dprintk("%s: File Layout has no file handles!!\n", __func__);
+ nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
+ goto out;
+ }
+
+ /* Ensure room for len, devid, util, first_stripe_index,
+ * pattern_offset, number of filehandles */
+ p = layoutlen_p = exp_xdr_reserve_qwords(xdr, 1+2+2+1+1+2+1);
+ if (!p) {
+ nfserr = NFS4ERR_TOOSMALL;
+ goto out;
+ }
+
+ /* save spot for opaque file layout length, fill-in later*/
+ p++;
+
+ /* encode device id */
+ p = exp_xdr_encode_u64(p, flp->device_id.sbid);
+ p = exp_xdr_encode_u64(p, flp->device_id.devid);
+
+ /* set and encode flags */
+ nfl_util = flp->lg_stripe_unit;
+ if (flp->lg_commit_through_mds)
+ nfl_util |= NFL4_UFLG_COMMIT_THRU_MDS;
+ if (flp->lg_stripe_type == STRIPE_DENSE)
+ nfl_util |= NFL4_UFLG_DENSE;
+ p = exp_xdr_encode_u32(p, nfl_util);
+
+ /* encode first stripe index */
+ p = exp_xdr_encode_u32(p, flp->lg_first_stripe_index);
+
+ /* encode striping pattern start */
+ p = exp_xdr_encode_u64(p, flp->lg_pattern_offset);
+
+ /* encode number of file handles */
+ p = exp_xdr_encode_u32(p, flp->lg_fh_length);
+
+ /* encode file handles */
+ for (i = 0; i < flp->lg_fh_length; i++) {
+ fhlen = flp->lg_fh_list[i].fh_size;
+ p = exp_xdr_reserve_space(xdr, 4 + fhlen);
+ if (!p) {
+ nfserr = NFS4ERR_TOOSMALL;
+ goto out;
+ }
+ p = exp_xdr_encode_opaque(p, &flp->lg_fh_list[i].fh_base, fhlen);
+ }
+
+ /* Set number of bytes encoded = total_bytes_encoded - length var */
+ len = (char *)p - (char *)layoutlen_p;
+ exp_xdr_encode_u32(layoutlen_p, len - 4);
+
+ nfserr = NFS4_OK;
+out:
+ dprintk("%s: End err %u xdrlen %d\n",
+ __func__, nfserr, len);
+ return nfserr;
+}
+EXPORT_SYMBOL(filelayout_encode_layout);
diff -up linux-2.6.38.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.38.noarch/fs/exportfs/pnfs_osd_xdr_srv.c
--- linux-2.6.38.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2011-03-26 07:57:44.232821674 -0400
+++ linux-2.6.38.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2011-03-26 07:57:44.232821674 -0400
@@ -0,0 +1,289 @@
+/*
+ * pnfs_osd_xdr_enc.c
+ *
+ * Object-Based pNFS Layout XDR layer
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/nfsd/nfsd4_pnfs.h>
+#include <linux/pnfs_osd_xdr.h>
+
+/*
+ * struct pnfs_osd_data_map {
+ * u32 odm_num_comps;
+ * u64 odm_stripe_unit;
+ * u32 odm_group_width;
+ * u32 odm_group_depth;
+ * u32 odm_mirror_cnt;
+ * u32 odm_raid_algorithm;
+ * };
+ */
+static int pnfs_osd_xdr_encode_data_map(
+ struct exp_xdr_stream *xdr,
+ struct pnfs_osd_data_map *data_map)
+{
+ __be32 *p = exp_xdr_reserve_qwords(xdr, 1+2+1+1+1+1);
+
+ if (!p)
+ return -E2BIG;
+
+ p = exp_xdr_encode_u32(p, data_map->odm_num_comps);
+ p = exp_xdr_encode_u64(p, data_map->odm_stripe_unit);
+ p = exp_xdr_encode_u32(p, data_map->odm_group_width);
+ p = exp_xdr_encode_u32(p, data_map->odm_group_depth);
+ p = exp_xdr_encode_u32(p, data_map->odm_mirror_cnt);
+ p = exp_xdr_encode_u32(p, data_map->odm_raid_algorithm);
+
+ return 0;
+}
+
+/*
+ * struct pnfs_osd_objid {
+ * struct pnfs_deviceid oid_device_id;
+ * u64 oid_partition_id;
+ * u64 oid_object_id;
+ * };
+ */
+static inline int pnfs_osd_xdr_encode_objid(
+ struct exp_xdr_stream *xdr,
+ struct pnfs_osd_objid *object_id)
+{
+ __be32 *p = exp_xdr_reserve_qwords(xdr, 2+2+2+2);
+ struct nfsd4_pnfs_deviceid *dev_id =
+ (struct nfsd4_pnfs_deviceid *)&object_id->oid_device_id;
+
+ if (!p)
+ return -E2BIG;
+
+ p = exp_xdr_encode_u64(p, dev_id->sbid);
+ p = exp_xdr_encode_u64(p, dev_id->devid);
+ p = exp_xdr_encode_u64(p, object_id->oid_partition_id);
+ p = exp_xdr_encode_u64(p, object_id->oid_object_id);
+
+ return 0;
+}
+
+/*
+ * enum pnfs_osd_cap_key_sec4 {
+ * PNFS_OSD_CAP_KEY_SEC_NONE = 0,
+ * PNFS_OSD_CAP_KEY_SEC_SSV = 1
+ * };
+ *
+ * struct pnfs_osd_object_cred {
+ * struct pnfs_osd_objid oc_object_id;
+ * u32 oc_osd_version;
+ * u32 oc_cap_key_sec;
+ * struct pnfs_osd_opaque_cred oc_cap_key
+ * struct pnfs_osd_opaque_cred oc_cap;
+ * };
+ */
+static int pnfs_osd_xdr_encode_object_cred(
+ struct exp_xdr_stream *xdr,
+ struct pnfs_osd_object_cred *olo_comp)
+{
+ __be32 *p;
+ int err;
+
+ err = pnfs_osd_xdr_encode_objid(xdr, &olo_comp->oc_object_id);
+ if (err)
+ return err;
+
+ p = exp_xdr_reserve_space(xdr, 3*4 + 4+olo_comp->oc_cap.cred_len);
+ if (!p)
+ return -E2BIG;
+
+ p = exp_xdr_encode_u32(p, olo_comp->oc_osd_version);
+
+ /* No sec for now */
+ p = exp_xdr_encode_u32(p, PNFS_OSD_CAP_KEY_SEC_NONE);
+ p = exp_xdr_encode_u32(p, 0); /* opaque oc_capability_key<> */
+
+ exp_xdr_encode_opaque(p, olo_comp->oc_cap.cred,
+ olo_comp->oc_cap.cred_len);
+
+ return 0;
+}
+
+/*
+ * struct pnfs_osd_layout {
+ * struct pnfs_osd_data_map olo_map;
+ * u32 olo_comps_index;
+ * u32 olo_num_comps;
+ * struct pnfs_osd_object_cred *olo_comps;
+ * };
+ */
+int pnfs_osd_xdr_encode_layout(
+ struct exp_xdr_stream *xdr,
+ struct pnfs_osd_layout *pol)
+{
+ __be32 *p;
+ u32 i;
+ int err;
+
+ err = pnfs_osd_xdr_encode_data_map(xdr, &pol->olo_map);
+ if (err)
+ return err;
+
+ p = exp_xdr_reserve_qwords(xdr, 2);
+ if (!p)
+ return -E2BIG;
+
+ p = exp_xdr_encode_u32(p, pol->olo_comps_index);
+ p = exp_xdr_encode_u32(p, pol->olo_num_comps);
+
+ for (i = 0; i < pol->olo_num_comps; i++) {
+ err = pnfs_osd_xdr_encode_object_cred(xdr, &pol->olo_comps[i]);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(pnfs_osd_xdr_encode_layout);
+
+static int _encode_string(struct exp_xdr_stream *xdr,
+ const struct nfs4_string *str)
+{
+ __be32 *p = exp_xdr_reserve_space(xdr, 4 + str->len);
+
+ if (!p)
+ return -E2BIG;
+ exp_xdr_encode_opaque(p, str->data, str->len);
+ return 0;
+}
+
+/* struct pnfs_osd_deviceaddr {
+ * struct pnfs_osd_targetid oda_targetid;
+ * struct pnfs_osd_targetaddr oda_targetaddr;
+ * u8 oda_lun[8];
+ * struct nfs4_string oda_systemid;
+ * struct pnfs_osd_object_cred oda_root_obj_cred;
+ * struct nfs4_string oda_osdname;
+ * };
+ */
+int pnfs_osd_xdr_encode_deviceaddr(
+ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr)
+{
+ __be32 *p;
+ int err;
+
+ p = exp_xdr_reserve_space(xdr, 4 + 4 + sizeof(devaddr->oda_lun));
+ if (!p)
+ return -E2BIG;
+
+ /* Empty oda_targetid */
+ p = exp_xdr_encode_u32(p, OBJ_TARGET_ANON);
+
+ /* Empty oda_targetaddr for now */
+ p = exp_xdr_encode_u32(p, 0);
+
+ /* oda_lun */
+ exp_xdr_encode_bytes(p, devaddr->oda_lun, sizeof(devaddr->oda_lun));
+
+ err = _encode_string(xdr, &devaddr->oda_systemid);
+ if (err)
+ return err;
+
+ err = pnfs_osd_xdr_encode_object_cred(xdr,
+ &devaddr->oda_root_obj_cred);
+ if (err)
+ return err;
+
+ err = _encode_string(xdr, &devaddr->oda_osdname);
+ if (err)
+ return err;
+
+ return 0;
+}
+EXPORT_SYMBOL(pnfs_osd_xdr_encode_deviceaddr);
+
+/*
+ * struct pnfs_osd_layoutupdate {
+ * u32 dsu_valid;
+ * s64 dsu_delta;
+ * u32 olu_ioerr_flag;
+ * };
+ */
+__be32 *
+pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p)
+{
+ lou->dsu_valid = be32_to_cpu(*p++);
+ if (lou->dsu_valid)
+ p = xdr_decode_hyper(p, &lou->dsu_delta);
+ lou->olu_ioerr_flag = be32_to_cpu(*p++);
+ return p;
+}
+EXPORT_SYMBOL(pnfs_osd_xdr_decode_layoutupdate);
+
+/*
+ * struct pnfs_osd_objid {
+ * struct pnfs_deviceid oid_device_id;
+ * u64 oid_partition_id;
+ * u64 oid_object_id;
+ * };
+ */
+static inline __be32 *
+pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
+{
+ /* FIXME: p = xdr_decode_fixed(...) */
+ memcpy(objid->oid_device_id.data, p, sizeof(objid->oid_device_id.data));
+ p += XDR_QUADLEN(sizeof(objid->oid_device_id.data));
+
+ p = xdr_decode_hyper(p, &objid->oid_partition_id);
+ p = xdr_decode_hyper(p, &objid->oid_object_id);
+ return p;
+}
+
+/*
+ * struct pnfs_osd_ioerr {
+ * struct pnfs_osd_objid oer_component;
+ * u64 oer_comp_offset;
+ * u64 oer_comp_length;
+ * u32 oer_iswrite;
+ * u32 oer_errno;
+ * };
+ */
+__be32 *
+pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p)
+{
+ p = pnfs_osd_xdr_decode_objid(p, &ioerr->oer_component);
+ p = xdr_decode_hyper(p, &ioerr->oer_comp_offset);
+ p = xdr_decode_hyper(p, &ioerr->oer_comp_length);
+ ioerr->oer_iswrite = be32_to_cpu(*p++);
+ ioerr->oer_errno = be32_to_cpu(*p++);
+ return p;
+}
+EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr);
diff -up linux-2.6.38.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.38.noarch/fs/gfs2/ops_fstype.c
--- linux-2.6.38.noarch/fs/gfs2/ops_fstype.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/gfs2/ops_fstype.c 2011-03-26 07:57:44.233821664 -0400
@@ -18,6 +18,7 @@
#include <linux/mount.h>
#include <linux/gfs2_ondisk.h>
#include <linux/quotaops.h>
+#include <linux/nfsd/nfs4pnfsdlm.h>
#include "gfs2.h"
#include "incore.h"
@@ -1107,6 +1108,9 @@ static int fill_super(struct super_block
sb->s_op = &gfs2_super_ops;
sb->s_d_op = &gfs2_dops;
sb->s_export_op = &gfs2_export_ops;
+#if defined(CONFIG_PNFSD)
+ sb->s_pnfs_op = &pnfs_dlm_export_ops;
+#endif /* CONFIG_PNFSD */
sb->s_xattr = gfs2_xattr_handlers;
sb->s_qcop = &gfs2_quotactl_ops;
sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
diff -up linux-2.6.38.noarch/fs/Kconfig.orig linux-2.6.38.noarch/fs/Kconfig
--- linux-2.6.38.noarch/fs/Kconfig.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/Kconfig 2011-03-26 07:57:44.221821816 -0400
@@ -49,6 +49,28 @@ config FS_POSIX_ACL
config EXPORTFS
tristate
+config EXPORTFS_FILE_LAYOUT
+ bool
+ depends on PNFSD && EXPORTFS
+ help
+ Exportfs support for the NFSv4.1 files layout type.
+ Must be automatically selected by supporting filesystems.
+
+config EXPORTFS_OSD_LAYOUT
+ bool
+ depends on PNFSD && EXPORTFS
+ help
+ Exportfs support for the NFSv4.1 objects layout type.
+ Must be automatically selected by supporting osd
+ filesystems.
+
+config EXPORTFS_BLOCK_LAYOUT
+ bool
+ depends on PNFSD && EXPORTFS
+ help
+ Exportfs support for the NFSv4.1 blocks layout type.
+ Must be automatically selected by supporting filesystems.
+
config FILE_LOCKING
bool "Enable POSIX file locking API" if EXPERT
default y
diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.38.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c
--- linux-2.6.38.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2011-03-26 07:57:44.235821643 -0400
+++ linux-2.6.38.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2011-03-26 07:57:44.235821643 -0400
@@ -0,0 +1,66 @@
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+struct pipefs_list bl_device_list;
+struct dentry *bl_device_pipe;
+
+ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t len)
+{
+ int err;
+ struct pipefs_hdr *msg;
+
+ dprintk("Entering %s...\n", __func__);
+
+ msg = pipefs_readmsg(filp, src, len);
+ if (IS_ERR(msg)) {
+ dprintk("ERROR: unable to read pipefs message.\n");
+ return PTR_ERR(msg);
+ }
+
+ /* now assign the result, which wakes the blocked thread */
+ err = pipefs_assign_upcall_reply(msg, &bl_device_list);
+ if (err) {
+ dprintk("ERROR: failed to assign upcall with id %u\n",
+ msg->msgid);
+ kfree(msg);
+ }
+ return len;
+}
+
+static const struct rpc_pipe_ops bl_pipe_ops = {
+ .upcall = pipefs_generic_upcall,
+ .downcall = bl_pipe_downcall,
+ .destroy_msg = pipefs_generic_destroy_msg,
+};
+
+int bl_pipe_init(void)
+{
+ dprintk("%s: block_device pipefs registering...\n", __func__);
+ bl_device_pipe = pipefs_mkpipe("bl_device_pipe", &bl_pipe_ops, 1);
+ if (IS_ERR(bl_device_pipe))
+ dprintk("ERROR, unable to make block_device pipe\n");
+
+ if (!bl_device_pipe)
+ dprintk("bl_device_pipe is NULL!\n");
+ else
+ dprintk("bl_device_pipe created!\n");
+ pipefs_init_list(&bl_device_list);
+ return 0;
+}
+
+void bl_pipe_exit(void)
+{
+ dprintk("%s: block_device pipefs unregistering...\n", __func__);
+	if (IS_ERR_OR_NULL(bl_device_pipe))
+		return;
+ pipefs_closepipe(bl_device_pipe);
+ return;
+}
diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.c
--- linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2011-03-26 07:57:44.237821622 -0400
+++ linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.c 2011-03-26 07:57:44.237821622 -0400
@@ -0,0 +1,1146 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/buffer_head.h> /* various write calls */
+#include <linux/bio.h> /* struct bio */
+#include <linux/vmalloc.h>
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
+MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
+
+/* Callback operations to the pNFS client */
+
+static void print_page(struct page *page)
+{
+ dprintk("PRINTPAGE page %p\n", page);
+ dprintk(" PagePrivate %d\n", PagePrivate(page));
+ dprintk(" PageUptodate %d\n", PageUptodate(page));
+ dprintk(" PageError %d\n", PageError(page));
+ dprintk(" PageDirty %d\n", PageDirty(page));
+ dprintk(" PageReferenced %d\n", PageReferenced(page));
+ dprintk(" PageLocked %d\n", PageLocked(page));
+ dprintk(" PageWriteback %d\n", PageWriteback(page));
+ dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
+ dprintk("\n");
+}
+
+/* Given the be associated with isect, determine if page data needs to be
+ * initialized.
+ */
+static int is_hole(struct pnfs_block_extent *be, sector_t isect)
+{
+ if (be->be_state == PNFS_BLOCK_NONE_DATA)
+ return 1;
+ else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
+ return 0;
+ else
+ return !is_sector_initialized(be->be_inval, isect);
+}
+
+/* Given the be associated with isect, determine if page data can be
+ * written to disk.
+ */
+static int is_writable(struct pnfs_block_extent *be, sector_t isect)
+{
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA)
+ return 1;
+ else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
+ return 0;
+ else
+ return is_sector_initialized(be->be_inval, isect);
+}
+
+static int
+dont_like_caller(struct nfs_page *req)
+{
+ if (atomic_read(&req->wb_complete)) {
+ /* Called by _multi */
+ return 1;
+ } else {
+ /* Called by _one */
+ return 0;
+ }
+}
+
+static enum pnfs_try_status
+bl_commit(struct nfs_write_data *nfs_data,
+ int sync)
+{
+ dprintk("%s enter\n", __func__);
+ return PNFS_NOT_ATTEMPTED;
+}
+
+/* The data we are handed might be spread across several bios. We need
+ * to track when the last one is finished.
+ */
+struct parallel_io {
+ struct kref refcnt;
+ struct rpc_call_ops call_ops;
+ void (*pnfs_callback) (void *data);
+ void *data;
+};
+
+static inline struct parallel_io *alloc_parallel(void *data)
+{
+ struct parallel_io *rv;
+
+ rv = kmalloc(sizeof(*rv), GFP_KERNEL);
+ if (rv) {
+ rv->data = data;
+ kref_init(&rv->refcnt);
+ }
+ return rv;
+}
+
+static inline void get_parallel(struct parallel_io *p)
+{
+ kref_get(&p->refcnt);
+}
+
+static void destroy_parallel(struct kref *kref)
+{
+ struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
+
+ dprintk("%s enter\n", __func__);
+ p->pnfs_callback(p->data);
+ kfree(p);
+}
+
+static inline void put_parallel(struct parallel_io *p)
+{
+ kref_put(&p->refcnt, destroy_parallel);
+}
+
+static struct bio *
+bl_submit_bio(int rw, struct bio *bio)
+{
+ if (bio) {
+ get_parallel(bio->bi_private);
+ dprintk("%s submitting %s bio %u@%llu\n", __func__,
+ rw == READ ? "read" : "write",
+ bio->bi_size, (u64)bio->bi_sector);
+ submit_bio(rw, bio);
+ }
+ return NULL;
+}
+
+static inline void
+bl_done_with_rpage(struct page *page, const int ok)
+{
+ if (ok) {
+ ClearPagePnfsErr(page);
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ SetPagePnfsErr(page);
+ }
+ /* Page is unlocked via rpc_release. Should really be done here. */
+}
+
+/* This is basically copied from mpage_end_io_read */
+static void bl_end_io_read(struct bio *bio, int err)
+{
+ void *data = bio->bi_private;
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+ bl_done_with_rpage(page, uptodate);
+ } while (bvec >= bio->bi_io_vec);
+ bio_put(bio);
+ put_parallel(data);
+}
+
+static void bl_read_cleanup(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_read_data *rdata;
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ rdata = container_of(task, struct nfs_read_data, task);
+ pnfs_read_done(rdata);
+}
+
+static void
+bl_end_par_io_read(void *data)
+{
+ struct nfs_read_data *rdata = data;
+
+ INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
+ schedule_work(&rdata->task.u.tk_work);
+}
+
+/* We don't want normal .rpc_call_done callback used, so we replace it
+ * with this stub.
+ */
+static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
+{
+ return;
+}
+
+static enum pnfs_try_status
+bl_read_pagelist(struct nfs_read_data *rdata,
+ unsigned nr_pages)
+{
+ int i, hole;
+ struct bio *bio = NULL;
+ struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+ sector_t isect, extent_length = 0;
+ struct parallel_io *par;
+ loff_t f_offset = rdata->args.offset;
+ size_t count = rdata->args.count;
+ struct page **pages = rdata->args.pages;
+ int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+
+ dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
+ nr_pages, f_offset, count);
+
+ if (dont_like_caller(rdata->req)) {
+ dprintk("%s dont_like_caller failed\n", __func__);
+ goto use_mds;
+ }
+ if ((nr_pages == 1) && PagePnfsErr(rdata->req->wb_page)) {
+ /* We want to fall back to mds in case of read_page
+ * after error on read_pages.
+ */
+ dprintk("%s PG_pnfserr set\n", __func__);
+ goto use_mds;
+ }
+ par = alloc_parallel(rdata);
+ if (!par)
+ goto use_mds;
+ par->call_ops = *rdata->pdata.call_ops;
+ par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+ par->pnfs_callback = bl_end_par_io_read;
+ /* At this point, we can no longer jump to use_mds */
+
+ isect = (sector_t) (f_offset >> 9);
+ /* Code assumes extents are page-aligned */
+ for (i = pg_index; i < nr_pages; i++) {
+ if (!extent_length) {
+ /* We've used up the previous extent */
+ put_extent(be);
+ put_extent(cow_read);
+ bio = bl_submit_bio(READ, bio);
+ /* Get the next one */
+ be = find_get_extent(BLK_LSEG2EXT(rdata->pdata.lseg),
+ isect, &cow_read);
+ if (!be) {
+ /* Error out this page */
+ bl_done_with_rpage(pages[i], 0);
+ break;
+ }
+ extent_length = be->be_length -
+ (isect - be->be_f_offset);
+ if (cow_read) {
+ sector_t cow_length = cow_read->be_length -
+ (isect - cow_read->be_f_offset);
+ extent_length = min(extent_length, cow_length);
+ }
+ }
+ hole = is_hole(be, isect);
+ if (hole && !cow_read) {
+ bio = bl_submit_bio(READ, bio);
+ /* Fill hole w/ zeroes w/o accessing device */
+ dprintk("%s Zeroing page for hole\n", __func__);
+ zero_user(pages[i], 0,
+ min_t(int, PAGE_CACHE_SIZE, count));
+ print_page(pages[i]);
+ bl_done_with_rpage(pages[i], 1);
+ } else {
+ struct pnfs_block_extent *be_read;
+
+ be_read = (hole && cow_read) ? cow_read : be;
+ for (;;) {
+ if (!bio) {
+ bio = bio_alloc(GFP_NOIO, nr_pages - i);
+ if (!bio) {
+ /* Error out this page */
+ bl_done_with_rpage(pages[i], 0);
+ break;
+ }
+ bio->bi_sector = isect -
+ be_read->be_f_offset +
+ be_read->be_v_offset;
+ bio->bi_bdev = be_read->be_mdev;
+ bio->bi_end_io = bl_end_io_read;
+ bio->bi_private = par;
+ }
+ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
+ break;
+ bio = bl_submit_bio(READ, bio);
+ }
+ }
+ isect += PAGE_CACHE_SIZE >> 9;
+ extent_length -= PAGE_CACHE_SIZE >> 9;
+ }
+ if ((isect << 9) >= rdata->inode->i_size) {
+ rdata->res.eof = 1;
+ rdata->res.count = rdata->inode->i_size - f_offset;
+ } else {
+ rdata->res.count = (isect << 9) - f_offset;
+ }
+ put_extent(be);
+ put_extent(cow_read);
+ bl_submit_bio(READ, bio);
+ put_parallel(par);
+ return PNFS_ATTEMPTED;
+
+ use_mds:
+ dprintk("Giving up and using normal NFS\n");
+ return PNFS_NOT_ATTEMPTED;
+}
+
+static void mark_extents_written(struct pnfs_block_layout *bl,
+ __u64 offset, __u32 count)
+{
+ sector_t isect, end;
+ struct pnfs_block_extent *be;
+
+ dprintk("%s(%llu, %u)\n", __func__, offset, count);
+ if (count == 0)
+ return;
+ isect = (offset & (long)(PAGE_CACHE_MASK)) >> 9;
+ end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
+ end >>= 9;
+ while (isect < end) {
+ sector_t len;
+ be = find_get_extent(bl, isect, NULL);
+ BUG_ON(!be); /* FIXME */
+ len = min(end, be->be_f_offset + be->be_length) - isect;
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA)
+ mark_for_commit(be, isect, len); /* What if fails? */
+ isect += len;
+ put_extent(be);
+ }
+}
+
+/* STUB - this needs thought */
+static inline void
+bl_done_with_wpage(struct page *page, const int ok)
+{
+ if (!ok) {
+ SetPageError(page);
+ SetPagePnfsErr(page);
+ /* This is an inline copy of nfs_zap_mapping */
+ /* This is oh so fishy, and needs deep thought */
+ if (page->mapping->nrpages != 0) {
+ struct inode *inode = page->mapping->host;
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+ spin_unlock(&inode->i_lock);
+ }
+ }
+ /* end_page_writeback called in rpc_release. Should be done here. */
+}
+
+/* This is basically copied from mpage_end_io_read */
+static void bl_end_io_write(struct bio *bio, int err)
+{
+ void *data = bio->bi_private;
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+ bl_done_with_wpage(page, uptodate);
+ } while (bvec >= bio->bi_io_vec);
+ bio_put(bio);
+ put_parallel(data);
+}
+
+/* Function scheduled for call during bl_end_par_io_write,
+ * it marks sectors as written and extends the commitlist.
+ */
+static void bl_write_cleanup(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_write_data *wdata;
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ wdata = container_of(task, struct nfs_write_data, task);
+ if (!wdata->task.tk_status) {
+ /* Marks for LAYOUTCOMMIT */
+ /* BUG - this should be called after each bio, not after
+ * all finish, unless have some way of storing success/failure
+ */
+ mark_extents_written(BLK_LSEG2EXT(wdata->pdata.lseg),
+ wdata->args.offset, wdata->args.count);
+ }
+ pnfs_writeback_done(wdata);
+}
+
+/* Called when last of bios associated with a bl_write_pagelist call finishes */
+static void
+bl_end_par_io_write(void *data)
+{
+ struct nfs_write_data *wdata = data;
+
+ /* STUB - ignoring error handling */
+ wdata->task.tk_status = 0;
+ wdata->verf.committed = NFS_FILE_SYNC;
+ INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
+ schedule_work(&wdata->task.u.tk_work);
+}
+
+/* Write wdata->args.pages to the block-device extents covered by the
+ * attached layout segment, building bios directly against the device.
+ * Returns PNFS_NOT_ATTEMPTED (fall back to writing through the MDS)
+ * when no layout segment is attached, dont_like_caller() rejects the
+ * request, or setup allocation fails; PNFS_ATTEMPTED otherwise.
+ * Completion is reported asynchronously via bl_end_par_io_write once
+ * every submitted bio has finished.
+ */
+static enum pnfs_try_status
+bl_write_pagelist(struct nfs_write_data *wdata,
+		  unsigned nr_pages,
+		  int sync)
+{
+	int i;
+	struct bio *bio = NULL;
+	struct pnfs_block_extent *be = NULL;
+	sector_t isect, extent_length = 0;
+	struct parallel_io *par;
+	loff_t offset = wdata->args.offset;
+	size_t count = wdata->args.count;
+	struct page **pages = wdata->args.pages;
+	int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
+
+	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
+	if (!wdata->req->wb_lseg) {
+		dprintk("%s no lseg, falling back to MDS\n", __func__);
+		return PNFS_NOT_ATTEMPTED;
+	}
+	if (dont_like_caller(wdata->req)) {
+		dprintk("%s dont_like_caller failed\n", __func__);
+		return PNFS_NOT_ATTEMPTED;
+	}
+	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
+	 * We want to write each, and if there is an error remove it from
+	 * list and call
+	 * nfs_retry_request(req) to have it redone using nfs.
+	 * QUEST? Do as block or per req? Think have to do per block
+	 * as part of end_bio
+	 */
+	par = alloc_parallel(wdata);
+	if (!par)
+		return PNFS_NOT_ATTEMPTED;
+	/* Per-RPC done callback becomes a no-op; the real completion work
+	 * runs when the last bio finishes (bl_end_par_io_write).
+	 */
+	par->call_ops = *wdata->pdata.call_ops;
+	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+	par->pnfs_callback = bl_end_par_io_write;
+	/* At this point, have to be more careful with error handling */
+
+	/* First 512-byte sector of the page-aligned start offset. */
+	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9);
+	for (i = pg_index; i < nr_pages; i++) {
+		if (!extent_length) {
+			/* We've used up the previous extent */
+			put_extent(be);
+			bio = bl_submit_bio(WRITE, bio);
+			/* Get the next one */
+			be = find_get_extent(BLK_LSEG2EXT(wdata->pdata.lseg),
+					     isect, NULL);
+			if (!be || !is_writable(be, isect)) {
+				/* FIXME */
+				bl_done_with_wpage(pages[i], 0);
+				break;
+			}
+			extent_length = be->be_length -
+				(isect - be->be_f_offset);
+		}
+		/* Add this page to the current bio, allocating one on
+		 * demand; when bio_add_page refuses (bio full), submit
+		 * the bio and retry with a fresh one.
+		 */
+		for (;;) {
+			if (!bio) {
+				bio = bio_alloc(GFP_NOIO, nr_pages - i);
+				if (!bio) {
+					/* Error out this page */
+					/* FIXME */
+					bl_done_with_wpage(pages[i], 0);
+					break;
+				}
+				bio->bi_sector = isect - be->be_f_offset +
+					be->be_v_offset;
+				bio->bi_bdev = be->be_mdev;
+				bio->bi_end_io = bl_end_io_write;
+				bio->bi_private = par;
+			}
+			if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
+				break;
+			bio = bl_submit_bio(WRITE, bio);
+		}
+		isect += PAGE_CACHE_SIZE >> 9;
+		extent_length -= PAGE_CACHE_SIZE >> 9;
+	}
+	wdata->res.count = (isect << 9) - (offset & (long)PAGE_CACHE_MASK);
+	put_extent(be);
+	bl_submit_bio(WRITE, bio);
+	/* Drop the setup reference; presumably the parallel-io callback
+	 * fires when the last outstanding bio completes - the
+	 * alloc_parallel/put_parallel implementation is not visible here.
+	 */
+	put_parallel(par);
+	return PNFS_ATTEMPTED;
+}
+
+/* FIXME - range ignored */
+/* Drop every extent on each of the per-state extent lists. */
+static void
+release_extents(struct pnfs_block_layout *bl,
+		struct pnfs_layout_range *range)
+{
+	struct pnfs_block_extent *be, *tmp;
+	int i;
+
+	spin_lock(&bl->bl_ext_lock);
+	for (i = 0; i < EXTENT_LISTS; i++) {
+		list_for_each_entry_safe(be, tmp, &bl->bl_extents[i],
+					 be_node) {
+			list_del(&be->be_node);
+			put_extent(be);
+		}
+	}
+	spin_unlock(&bl->bl_ext_lock);
+}
+
+/* Free all invalid-data tracking nodes hanging off the stub list. */
+static void
+release_inval_marks(struct pnfs_inval_markings *marks)
+{
+	struct pnfs_inval_tracking *pos;
+
+	while (!list_empty(&marks->im_tree.mtt_stub)) {
+		pos = list_first_entry(&marks->im_tree.mtt_stub,
+				       struct pnfs_inval_tracking, it_link);
+		list_del(&pos->it_link);
+		kfree(pos);
+	}
+}
+
+/* Note we are relying on caller locking to prevent nasty races. */
+static void
+bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+
+	dprintk("%s enter\n", __func__);
+	/* Tear down extents first, then the invalid-data tracking nodes,
+	 * then the layout structure itself.
+	 */
+	release_extents(bl, NULL);
+	release_inval_marks(&bl->bl_inval);
+	kfree(bl);
+}
+
+/* Allocate and initialize a block layout header for @inode.  Returns
+ * the embedded generic pnfs_layout_hdr, or NULL on allocation failure.
+ */
+static struct pnfs_layout_hdr *
+bl_alloc_layout_hdr(struct inode *inode)
+{
+	struct pnfs_block_layout *bl;
+
+	dprintk("%s enter\n", __func__);
+	bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+	if (!bl)
+		return NULL;
+	spin_lock_init(&bl->bl_ext_lock);
+	INIT_LIST_HEAD(&bl->bl_extents[0]);
+	INIT_LIST_HEAD(&bl->bl_extents[1]);
+	INIT_LIST_HEAD(&bl->bl_commit);
+	/* Redundant after kzalloc, but kept as explicit documentation. */
+	bl->bl_count = 0;
+	/* Server reports blocksize in bytes; store 512-byte sectors. */
+	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> 9;
+	INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
+	return &bl->bl_layout;
+}
+
+/* Release a layout segment; all extent state lives layout-wide (see the
+ * comment above bl_alloc_lseg), so only the lseg itself is freed.
+ */
+static void
+bl_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	dprintk("%s enter\n", __func__);
+	kfree(lseg);
+}
+
+/* Because the generic infrastructure does not correctly merge layouts,
+ * we pretty much ignore lseg, and store all data layout wide, so we
+ * can correctly merge. Eventually we should push some correct merge
+ * behavior up to the generic code, as the current behavior tends to
+ * cause lots of unnecessary overlapping LAYOUTGET requests.
+ */
+static struct pnfs_layout_segment *
+bl_alloc_lseg(struct pnfs_layout_hdr *lo,
+	      struct nfs4_layoutget_res *lgr)
+{
+	struct pnfs_layout_segment *lseg;
+	int status;
+
+	dprintk("%s enter\n", __func__);
+	lseg = kzalloc(sizeof(*lseg), GFP_KERNEL);
+	if (lseg == NULL)
+		return NULL;
+	status = nfs4_blk_process_layoutget(lo, lgr);
+	if (status == 0)
+		return lseg;
+
+	/* We don't want to call the full-blown bl_free_lseg,
+	 * since on error extents were not touched.
+	 */
+	/* STUB - we really want to distinguish between 2 error
+	 * conditions here. This lseg failed, but lo data structures
+	 * are OK, or we hosed the lo data structures. The calling
+	 * code probably needs to distinguish this too.
+	 */
+	kfree(lseg);
+	return ERR_PTR(status);
+}
+
+/* Prepare a LAYOUTCOMMIT: widen the commit range to block boundaries
+ * and attach a driver-private list for the modified-extent ranges.
+ * Returns 0 or -ENOMEM.
+ */
+static int
+bl_setup_layoutcommit(struct pnfs_layout_hdr *lo,
+		      struct nfs4_layoutcommit_args *arg)
+{
+	struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
+	struct bl_layoutupdate_data *layoutupdate_data;
+
+	dprintk("%s enter\n", __func__);
+	/* Need to ensure commit is block-size aligned */
+	/* Round offset down and (offset+length) up to block boundaries.
+	 * NOTE(review): the mask arithmetic assumes pnfs_blksize is a
+	 * power of two - confirm the server guarantees this.
+	 */
+	if (nfss->pnfs_blksize) {
+		u64 mask = nfss->pnfs_blksize - 1;
+		u64 offset = arg->range.offset & mask;
+
+		arg->range.offset -= offset;
+		arg->range.length += offset + mask;
+		arg->range.length &= ~mask;
+	}
+
+	layoutupdate_data = kmalloc(sizeof(struct bl_layoutupdate_data),
+				    GFP_KERNEL);
+	if (unlikely(!layoutupdate_data))
+		return -ENOMEM;
+	INIT_LIST_HEAD(&layoutupdate_data->ranges);
+	/* Freed in bl_cleanup_layoutcommit. */
+	arg->layoutdriver_data = layoutupdate_data;
+
+	return 0;
+}
+
+/* XDR-encode the driver-private portion of a LAYOUTCOMMIT. */
+static void
+bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
+		       const struct nfs4_layoutcommit_args *arg)
+{
+	dprintk("%s enter\n", __func__);
+	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
+}
+
+/* Post-LAYOUTCOMMIT cleanup: release per-commit extent state and the
+ * private data allocated in bl_setup_layoutcommit.
+ */
+static void
+bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo,
+			struct nfs4_layoutcommit_data *lcdata)
+{
+	dprintk("%s enter\n", __func__);
+	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
+	kfree(lcdata->args.layoutdriver_data);
+}
+
+/* Free a block mount id and every device hanging off its device list.
+ * NULL is tolerated.
+ */
+static void free_blk_mountid(struct block_mount_id *mid)
+{
+	struct pnfs_block_dev *dev, *tmp;
+
+	if (!mid)
+		return;
+	spin_lock(&mid->bm_lock);
+	list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
+		list_del(&dev->bm_node);
+		free_block_dev(dev);
+	}
+	spin_unlock(&mid->bm_lock);
+	kfree(mid);
+}
+
+/* This is mostly copied from the filelayout's get_device_info function.
+ * It seems much of this should be at the generic pnfs level.
+ */
+/* Issue GETDEVICEINFO for @d_id and decode the returned opaque volume
+ * topology into a pnfs_block_dev.  Returns NULL on any failure.
+ */
+static struct pnfs_block_dev *
+nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
+			struct nfs4_deviceid *d_id,
+			struct list_head *sdlist)
+{
+	struct pnfs_device *dev;
+	struct pnfs_block_dev *rv = NULL;
+	u32 max_resp_sz;
+	int max_pages;
+	struct page **pages = NULL;
+	int i, rc;
+
+	/*
+	 * Use the session max response size as the basis for setting
+	 * GETDEVICEINFO's maxcount
+	 */
+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	max_pages = max_resp_sz >> PAGE_SHIFT;
+	dprintk("%s max_resp_sz %u max_pages %d\n",
+		__func__, max_resp_sz, max_pages);
+
+	/* kzalloc (not kmalloc) so dev->area is NULL if we bail out
+	 * before vmap(); the cleanup path below tests dev->area.
+	 */
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev) {
+		dprintk("%s kmalloc failed\n", __func__);
+		return NULL;
+	}
+
+	pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
+	if (pages == NULL) {
+		kfree(dev);
+		return NULL;
+	}
+	for (i = 0; i < max_pages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			goto out_free;
+	}
+
+	/* set dev->area */
+	dev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
+	if (!dev->area)
+		goto out_free;
+
+	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
+	dev->layout_type = LAYOUT_BLOCK_VOLUME;
+	dev->pages = pages;
+	dev->pgbase = 0;
+	dev->pglen = PAGE_SIZE * max_pages;
+	dev->mincount = 0;
+
+	/* NOTE(review): dev_id.data is opaque bytes; printing with %s
+	 * assumes NUL termination - confirm.
+	 */
+	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
+	rc = nfs4_proc_getdeviceinfo(server, dev);
+	dprintk("%s getdevice info returns %d\n", __func__, rc);
+	if (rc)
+		goto out_free;
+
+	rv = nfs4_blk_decode_device(server, dev, sdlist);
+ out_free:
+	if (dev->area != NULL)
+		vunmap(dev->area);
+	/* pages[] may be only partially populated if alloc_page failed
+	 * above; __free_page(NULL) would oops, so skip empty slots.
+	 */
+	for (i = 0; i < max_pages; i++)
+		if (pages[i])
+			__free_page(pages[i]);
+	kfree(pages);
+	kfree(dev);
+	return rv;
+}
+
+
+/*
+ * Retrieve the list of available devices for the mountpoint.
+ */
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
+	struct block_mount_id *b_mt_id = NULL;
+	struct pnfs_mount_type *mtype = NULL;
+	struct pnfs_devicelist *dlist = NULL;
+	struct pnfs_block_dev *bdev;
+	LIST_HEAD(block_disklist);
+	int status = 0, i;
+
+	dprintk("%s enter\n", __func__);
+
+	if (server->pnfs_blksize == 0) {
+		dprintk("%s Server did not return blksize\n", __func__);
+		return -EINVAL;
+	}
+	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_KERNEL);
+	if (!b_mt_id) {
+		status = -ENOMEM;
+		goto out_error;
+	}
+	/* Initialize nfs4 block layout mount id */
+	spin_lock_init(&b_mt_id->bm_lock);
+	INIT_LIST_HEAD(&b_mt_id->bm_devlist);
+
+	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_KERNEL);
+	if (!dlist) {
+		/* Previously fell through with status == 0, reporting
+		 * success on an allocation failure.
+		 */
+		status = -ENOMEM;
+		goto out_error;
+	}
+	dlist->eof = 0;
+	while (!dlist->eof) {
+		status = nfs4_proc_getdevicelist(server, fh, dlist);
+		if (status)
+			goto out_error;
+		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
+			__func__, dlist->num_devs, dlist->eof);
+		/* For each device returned in dlist, call GETDEVICEINFO, and
+		 * decode the opaque topology encoding to create a flat
+		 * volume topology, matching VOLUME_SIMPLE disk signatures
+		 * to disks in the visible block disk list.
+		 * Construct an LVM meta device from the flat volume topology.
+		 */
+		for (i = 0; i < dlist->num_devs; i++) {
+			bdev = nfs4_blk_get_deviceinfo(server, fh,
+						       &dlist->dev_id[i],
+						       &block_disklist);
+			if (!bdev) {
+				status = -ENODEV;
+				goto out_error;
+			}
+			spin_lock(&b_mt_id->bm_lock);
+			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
+			spin_unlock(&b_mt_id->bm_lock);
+		}
+	}
+	dprintk("%s SUCCESS\n", __func__);
+	server->pnfs_ld_data = b_mt_id;
+
+ out_return:
+	kfree(dlist);
+	return status;
+
+ out_error:
+	free_blk_mountid(b_mt_id);
+	kfree(mtype);
+	goto out_return;
+}
+
+/* Undo bl_set_layoutdriver: free the mount id and its device list.
+ * Always returns 0.
+ */
+static int
+bl_clear_layoutdriver(struct nfs_server *server)
+{
+	struct block_mount_id *b_mt_id = server->pnfs_ld_data;
+
+	dprintk("%s enter\n", __func__);
+	free_blk_mountid(b_mt_id);
+	dprintk("%s RETURNS\n", __func__);
+	return 0;
+}
+
+/* STUB - mark intersection of layout and page as bad, so is not
+ * used again.
+ */
+static void mark_bad_read(void)
+{
+	/* Intentionally empty for now; see STUB note above. */
+	return;
+}
+
+/* Copied from buffer.c */
+/* Set or clear the buffer's uptodate bit to match the I/O result and
+ * release the buffer lock.
+ */
+static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+	} else {
+		/* This happens, due to failed READA attempts. */
+		clear_buffer_uptodate(bh);
+	}
+	unlock_buffer(bh);
+}
+
+/* Copied from buffer.c */
+/* b_end_io completion used by init_page_for_write's synchronous read. */
+static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
+{
+	__end_buffer_read_notouch(bh, uptodate);
+}
+
+/*
+ * map_block: map a requested I/0 block (isect) into an offset in the LVM
+ * meta block_device
+ */
+static void
+map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh)
+{
+	dprintk("%s enter be=%p\n", __func__, be);
+
+	set_buffer_mapped(bh);
+	bh->b_bdev = be->be_mdev;
+	/* Translate the file-relative 512-byte sector into a device block
+	 * number: shift into the extent's volume offset, then scale from
+	 * sectors to the device's block size (i_blkbits).
+	 */
+	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
+			(be->be_mdev->bd_inode->i_blkbits - 9);
+
+	dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n",
+				__func__, (long)isect,
+				(long)bh->b_blocknr,
+				bh->b_size);
+	return;
+}
+
+/* Given an unmapped page, zero it (or read in page for COW),
+ * and set appropriate flags/markings, but it is safe to not initialize
+ * the range given in [from, to).
+ */
+/* This is loosely based on nobh_write_begin */
+static int
+init_page_for_write(struct pnfs_block_layout *bl, struct page *page,
+ unsigned from, unsigned to, sector_t **pages_to_mark)
+{
+ struct buffer_head *bh;
+ int inval, ret = -EIO;
+ struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+ sector_t isect;
+
+ dprintk("%s enter, %p\n", __func__, page);
+ bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
+ if (!bh) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+
+ isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9);
+ be = find_get_extent(bl, isect, &cow_read);
+ if (!be)
+ goto cleanup;
+ inval = is_hole(be, isect);
+ dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to);
+ if (inval) {
+ if (be->be_state == PNFS_BLOCK_NONE_DATA) {
+ dprintk("%s PANIC - got NONE_DATA extent %p\n",
+ __func__, be);
+ goto cleanup;
+ }
+ map_block(isect, be, bh);
+ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ }
+ if (PageUptodate(page)) {
+ /* Do nothing */
+ } else if (inval & !cow_read) {
+ zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE);
+ } else if (0 < from || PAGE_CACHE_SIZE > to) {
+ struct pnfs_block_extent *read_extent;
+
+ read_extent = (inval && cow_read) ? cow_read : be;
+ map_block(isect, read_extent, bh);
+ lock_buffer(bh);
+ bh->b_end_io = end_buffer_read_nobh;
+ submit_bh(READ, bh);
+ dprintk("%s: Waiting for buffer read\n", __func__);
+ /* XXX Don't really want to hold layout lock here */
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ goto cleanup;
+ }
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ /* There is a BUG here if is a short copy after write_begin,
+ * but I think this is a generic fs bug. The problem is that
+ * we have marked the page as initialized, but it is possible
+ * that the section not copied may never get copied.
+ */
+ ret = mark_initialized_sectors(be->be_inval, isect,
+ PAGE_CACHE_SECTORS,
+ pages_to_mark);
+ /* Want to preallocate mem so above can't fail */
+ if (ret)
+ goto cleanup;
+ }
+ SetPageMappedToDisk(page);
+ ret = 0;
+
+cleanup:
+ free_buffer_head(bh);
+ put_extent(be);
+ put_extent(cow_read);
+ if (ret) {
+ /* Need to mark layout with bad read...should now
+ * just use nfs4 for reads and writes.
+ */
+ mark_bad_read();
+ }
+ return ret;
+}
+
+/* pNFS write_begin hook: make sure the page is mapped and initialized
+ * on the block device before the generic path copies user data in.
+ * On unsupported blocksize or init failure this drops fsdata->lseg so
+ * the write falls back to plain NFS, and returns 0 (not an error).
+ */
+static int
+bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos,
+	       unsigned count, struct pnfs_fsdata *fsdata)
+{
+	unsigned from, to;
+	int ret;
+	sector_t *pages_to_mark = NULL;
+	struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg);
+
+	dprintk("%s enter, %u@%lld\n", __func__, count, pos);
+	print_page(page);
+	/* The following code assumes blocksize >= PAGE_CACHE_SIZE */
+	if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) {
+		dprintk("%s Can't handle blocksize %llu\n", __func__,
+			(u64)bl->bl_blocksize);
+		put_lseg(fsdata->lseg);
+		fsdata->lseg = NULL;
+		return 0;
+	}
+	if (PageMappedToDisk(page)) {
+		/* Basically, this is a flag that says we have
+		 * successfully called write_begin already on this page.
+		 */
+		/* NOTE - there are cache consistency issues here.
+		 * For example, what if the layout is recalled, then regained?
+		 * If the file is closed and reopened, will the page flags
+		 * be reset? If not, we'll have to use layout info instead of
+		 * the page flag.
+		 */
+		return 0;
+	}
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + count;
+	ret = init_page_for_write(bl, page, from, to, &pages_to_mark);
+	if (ret) {
+		dprintk("%s init page failed with %i", __func__, ret);
+		/* Revert back to plain NFS and just continue on with
+		 * write. This assumes there is no request attached, which
+		 * should be true if we get here.
+		 */
+		BUG_ON(PagePrivate(page));
+		put_lseg(fsdata->lseg);
+		fsdata->lseg = NULL;
+		kfree(pages_to_mark);
+		ret = 0;
+	} else {
+		/* Sectors needing initialization; consumed (and freed) by
+		 * bl_write_end_cleanup.
+		 */
+		fsdata->private = pages_to_mark;
+	}
+	return ret;
+}
+
+/* CAREFUL - what happens if copied < count??? */
+/* pNFS write_end hook: a page that went through bl_write_begin with a
+ * layout segment was fully initialized there, so mark it uptodate.
+ * Always returns 0.
+ */
+static int
+bl_write_end(struct inode *inode, struct page *page, loff_t pos,
+	     unsigned count, unsigned copied, struct pnfs_layout_segment *lseg)
+{
+	dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg);
+	print_page(page);
+	if (lseg)
+		SetPageUptodate(page);
+	return 0;
+}
+
+/* Return any memory allocated to fsdata->private, and take advantage
+ * of no page locks to mark pages noted in write_begin as needing
+ * initialization.
+ */
+static void
+bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata)
+{
+	struct page *page;
+	pgoff_t index;
+	sector_t *pos;
+	struct address_space *mapping = filp->f_mapping;
+	struct pnfs_fsdata *fake_data;
+	struct pnfs_layout_segment *lseg;
+
+	if (!fsdata)
+		return;
+	lseg = fsdata->lseg;
+	if (!lseg)
+		return;
+	pos = fsdata->private;
+	if (!pos)
+		return;
+	dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos));
+	/* fsdata->private is a ~0-terminated array of 512-byte sector
+	 * numbers recorded by init_page_for_write.
+	 */
+	for (; *pos != ~0; pos++) {
+		index = *pos >> (PAGE_CACHE_SHIFT - 9);
+		/* XXX How do we properly deal with failures here??? */
+		page = grab_cache_page_write_begin(mapping, index, 0);
+		if (!page) {
+			printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__);
+			continue;
+		}
+		dprintk("%s: Examining block page\n", __func__);
+		print_page(page);
+		if (!PageMappedToDisk(page)) {
+			/* XXX How do we properly deal with failures here??? */
+			dprintk("%s Marking block page\n", __func__);
+			/* from == to == PAGE_CACHE_SIZE: initialize the
+			 * whole page, nothing is about to be copied in.
+			 */
+			init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page,
+					    PAGE_CACHE_SIZE, PAGE_CACHE_SIZE,
+					    NULL);
+			print_page(page);
+			/* Push the page through ->write_end with a
+			 * synthesized fsdata so the normal write-out
+			 * machinery picks it up.
+			 */
+			fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL);
+			if (!fake_data) {
+				printk(KERN_ERR "%s BUG BUG BUG NoMem\n",
+				       __func__);
+				unlock_page(page);
+				continue;
+			}
+			get_lseg(lseg);
+			fake_data->lseg = lseg;
+			fake_data->bypass_eof = 1;
+			mapping->a_ops->write_end(filp, mapping,
+						  index << PAGE_CACHE_SHIFT,
+						  PAGE_CACHE_SIZE,
+						  PAGE_CACHE_SIZE,
+						  page, fake_data);
+			/* Note fake_data is freed by nfs_write_end */
+		} else
+			unlock_page(page);
+	}
+	kfree(fsdata->private);
+	fsdata->private = NULL;
+}
+
+/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request.
+ * Should return False if there is a reason requests can not be coalesced,
+ * otherwise, should default to returning True.
+ */
+static int
+bl_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+	   struct nfs_page *req)
+{
+	dprintk("%s enter\n", __func__);
+	/* Reads always coalesce; writes only within the same lseg. */
+	if (!pgio->pg_iswrite)
+		return 1;
+	return prev->wb_lseg == req->wb_lseg;
+}
+
+/* Operations vector registered with the generic pNFS layer. */
+static struct pnfs_layoutdriver_type blocklayout_type = {
+	.id = LAYOUT_BLOCK_VOLUME,
+	.name = "LAYOUT_BLOCK_VOLUME",
+	.commit = bl_commit,
+	.read_pagelist = bl_read_pagelist,
+	.write_pagelist = bl_write_pagelist,
+	.write_begin = bl_write_begin,
+	.write_end = bl_write_end,
+	.write_end_cleanup = bl_write_end_cleanup,
+	.alloc_layout_hdr = bl_alloc_layout_hdr,
+	.free_layout_hdr = bl_free_layout_hdr,
+	.alloc_lseg = bl_alloc_lseg,
+	.free_lseg = bl_free_lseg,
+	.setup_layoutcommit = bl_setup_layoutcommit,
+	.encode_layoutcommit = bl_encode_layoutcommit,
+	.cleanup_layoutcommit = bl_cleanup_layoutcommit,
+	.set_layoutdriver = bl_set_layoutdriver,
+	.clear_layoutdriver = bl_clear_layoutdriver,
+	.pg_test = bl_pg_test,
+};
+
+/* Register the driver and bring up the userspace device pipe. */
+static int __init nfs4blocklayout_init(void)
+{
+	int ret;
+
+	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
+
+	ret = pnfs_register_layoutdriver(&blocklayout_type);
+	if (!ret)
+		bl_pipe_init();
+	/* NOTE(review): bl_pipe_init()'s result is ignored; if it can
+	 * fail, registration should be unwound - confirm its signature.
+	 */
+	return ret;
+}
+
+static void __exit nfs4blocklayout_exit(void)
+{
+	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
+	       __func__);
+
+	pnfs_unregister_layoutdriver(&blocklayout_type);
+	bl_pipe_exit();
+}
+
+module_init(nfs4blocklayout_init);
+module_exit(nfs4blocklayout_exit);
diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdev.c
--- linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2011-03-26 07:57:44.238821614 -0400
+++ linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2011-03-26 07:57:44.238821614 -0400
@@ -0,0 +1,334 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayoutdev.c
+ *
+ * Device operations for the pnfs nfs4 block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#include <linux/module.h>
+#include <linux/buffer_head.h> /* __bread */
+
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/hash.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/* Bounds-check an XDR read of nbytes starting at p: returns p when the
+ * read fits before end, NULL on overrun (including pointer wrap).
+ */
+uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes)
+{
+	uint32_t *q = p + XDR_QUADLEN(nbytes);
+
+	return (q > end || q < p) ? NULL : p;
+}
+EXPORT_SYMBOL(blk_overflow);
+
+/* Open a block_device by device number.  Returns NULL (not an
+ * ERR_PTR) on failure.
+ */
+struct block_device *nfs4_blkdev_get(dev_t dev)
+{
+	struct block_device *bd;
+
+	dprintk("%s enter\n", __func__);
+	bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+	if (!IS_ERR(bd))
+		return bd;
+	dprintk("%s failed to open device : %ld\n",
+		__func__, PTR_ERR(bd));
+	return NULL;
+}
+
+/*
+ * Release the block device
+ */
+/* Counterpart of nfs4_blkdev_get; propagates blkdev_put's result. */
+int nfs4_blkdev_put(struct block_device *bdev)
+{
+	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
+			MINOR(bdev->bd_dev));
+	return blkdev_put(bdev, FMODE_READ);
+}
+
+/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded
+ * in dev->dev_addr_buf.
+ */
+/* Upcall to the userspace daemon with the raw GETDEVICEINFO payload,
+ * get back a major:minor, open that device and wrap it in a
+ * pnfs_block_dev.  Returns NULL on any failure.
+ */
+struct pnfs_block_dev *
+nfs4_blk_decode_device(struct nfs_server *server,
+		       struct pnfs_device *dev,
+		       struct list_head *sdlist)
+{
+	struct pnfs_block_dev *rv = NULL;
+	struct block_device *bd = NULL;
+	struct pipefs_hdr *msg = NULL, *reply = NULL;
+	uint32_t major, minor;
+
+	dprintk("%s enter\n", __func__);
+
+	if (IS_ERR(bl_device_pipe))
+		return NULL;
+	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
+		dev->mincount);
+	msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area,
+				    dev->mincount);
+	if (IS_ERR(msg)) {
+		dprintk("ERROR: couldn't make pipefs message.\n");
+		goto out_err;
+	}
+	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
+	msg->status = BL_DEVICE_REQUEST_INIT;
+
+	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
+					      &bl_device_list, 0, 0);
+
+	if (IS_ERR(reply)) {
+		dprintk("ERROR: upcall_waitreply failed\n");
+		goto out_err;
+	}
+	if (reply->status != BL_DEVICE_REQUEST_PROC) {
+		/* Previously printed PTR_ERR(bd), but bd is still NULL
+		 * here; report the daemon's status instead.
+		 */
+		dprintk("%s upcall rejected: status %u\n",
+			__func__, (unsigned)reply->status);
+		goto out_err;
+	}
+	/* Reply payload is two packed u32s: major then minor. */
+	memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t));
+	memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)),
+	       sizeof(uint32_t));
+	bd = nfs4_blkdev_get(MKDEV(major, minor));
+	if (!bd) {
+		/* nfs4_blkdev_get() returns NULL (not an ERR_PTR) on
+		 * failure, so the old IS_ERR() test could never fire and
+		 * a failed open fell through to a NULL deref below.
+		 */
+		dprintk("%s failed to open device %u:%u\n",
+			__func__, major, minor);
+		goto out_err;
+	}
+
+	rv = kzalloc(sizeof(*rv), GFP_KERNEL);
+	if (!rv) {
+		/* Don't leak the just-opened block device. */
+		nfs4_blkdev_put(bd);
+		goto out_err;
+	}
+
+	rv->bm_mdev = bd;
+	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
+	dprintk("%s Created device %s with bd_block_size %u\n",
+		__func__,
+		bd->bd_disk->disk_name,
+		bd->bd_block_size);
+	kfree(reply);
+	kfree(msg);
+	return rv;
+
+out_err:
+	kfree(rv);
+	if (!IS_ERR(reply))
+		kfree(reply);
+	if (!IS_ERR(msg))
+		kfree(msg);
+	return NULL;
+}
+
+/* Map deviceid returned by the server to constructed block_device */
+static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
+					    struct nfs4_deviceid *id)
+{
+	struct block_device *found = NULL;
+	struct block_mount_id *mid;
+	struct pnfs_block_dev *dev;
+
+	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
+	mid = BLK_ID(lo);
+	spin_lock(&mid->bm_lock);
+	list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
+		if (!memcmp(id->data, dev->bm_mdevid.data,
+			    NFS4_DEVICEID4_SIZE)) {
+			found = dev->bm_mdev;
+			break;
+		}
+	}
+	spin_unlock(&mid->bm_lock);
+	dprintk("%s returning %p\n", __func__, found);
+	return found;
+}
+
+/* Tracks info needed to ensure extents in layout obey constraints of spec */
+struct layout_verification {
+	u32 mode;	/* R or RW */
+	u64 start;	/* Expected start of next non-COW extent */
+	u64 inval;	/* Start of INVAL coverage */
+	u64 cowread;	/* End of COW read coverage */
+};
+
+/* Verify the extent meets the layout requirements of the pnfs-block draft,
+ * section 2.3.1.
+ */
+/* Returns 0 if @be is consistent with the running state in @lv (which
+ * it advances), -EIO on any violation.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+			 struct layout_verification *lv)
+{
+	if (lv->mode == IOMODE_READ) {
+		/* Read-only layouts may not contain writable extents. */
+		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+		    be->be_state == PNFS_BLOCK_INVALID_DATA)
+			return -EIO;
+		/* Extents must tile the range with no gaps or overlaps. */
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+	}
+	/* lv->mode == IOMODE_RW */
+	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		/* A pending COW read range must not extend past here. */
+		if (lv->cowread > lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		lv->inval = lv->start;
+		return 0;
+	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
+		/* READ_DATA extents provide COW sources and may only
+		 * trail behind the INVALID coverage already seen.
+		 */
+		if (be->be_f_offset > lv->start)
+			return -EIO;
+		if (be->be_f_offset < lv->inval)
+			return -EIO;
+		if (be->be_f_offset < lv->cowread)
+			return -EIO;
+		/* It looks like you might want to min this with lv->start,
+		 * but you really don't.
+		 */
+		lv->inval = lv->inval + be->be_length;
+		lv->cowread = be->be_f_offset + be->be_length;
+		return 0;
+	} else
+		return -EIO;
+}
+
+/* XDR decode pnfs_block_layout4 structure */
+/* Decode the opaque layout body returned by LAYOUTGET into extents,
+ * verify them against the draft's ordering invariants, then merge them
+ * into the layout-wide extent lists.  Returns 0 or a negative errno.
+ */
+int
+nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+			   struct nfs4_layoutget_res *lgr)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	uint32_t *p = (uint32_t *)lgr->layout.buf;
+	uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len);
+	int i, status = -EIO;
+	uint32_t count;
+	struct pnfs_block_extent *be = NULL, *save;
+	uint64_t tmp; /* Used by READSECTOR */
+	/* Verification state starts at the requested range's first
+	 * sector (offsets are bytes; stored as 512-byte sectors).
+	 */
+	struct layout_verification lv = {
+		.mode = lgr->range.iomode,
+		.start = lgr->range.offset >> 9,
+		.inval = lgr->range.offset >> 9,
+		.cowread = lgr->range.offset >> 9,
+	};
+
+	LIST_HEAD(extents);
+
+	/* NOTE(review): BLK_READBUF/READ32/READ_SECTOR appear to bail
+	 * out on short buffers - macro definitions not visible here;
+	 * confirm they jump to out_err.
+	 */
+	BLK_READBUF(p, end, 4);
+	READ32(count);
+
+	dprintk("%s enter, number of extents %i\n", __func__, count);
+	BLK_READBUF(p, end, (28 + NFS4_DEVICEID4_SIZE) * count);
+
+	/* Decode individual extents, putting them in temporary
+	 * staging area until whole layout is decoded to make error
+	 * recovery easier.
+	 */
+	for (i = 0; i < count; i++) {
+		be = alloc_extent();
+		if (!be) {
+			status = -ENOMEM;
+			goto out_err;
+		}
+		READ_DEVID(&be->be_devid);
+		be->be_mdev = translate_devid(lo, &be->be_devid);
+		if (!be->be_mdev)
+			goto out_err;
+		/* The next three values are read in as bytes,
+		 * but stored as 512-byte sector lengths
+		 */
+		READ_SECTOR(be->be_f_offset);
+		READ_SECTOR(be->be_length);
+		READ_SECTOR(be->be_v_offset);
+		READ32(be->be_state);
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
+			be->be_inval = &bl->bl_inval;
+		if (verify_extent(be, &lv)) {
+			dprintk("%s verify failed\n", __func__);
+			goto out_err;
+		}
+		list_add_tail(&be->be_node, &extents);
+	}
+	/* be is set to NULL before each of the following jumps so that
+	 * out_err's put_extent(be) does not double-put a list member.
+	 */
+	if (p != end) {
+		dprintk("%s Undecoded cruft at end of opaque\n", __func__);
+		be = NULL;
+		goto out_err;
+	}
+	if (lgr->range.offset + lgr->range.length != lv.start << 9) {
+		dprintk("%s Final length mismatch\n", __func__);
+		be = NULL;
+		goto out_err;
+	}
+	if (lv.start < lv.cowread) {
+		dprintk("%s Final uncovered COW extent\n", __func__);
+		be = NULL;
+		goto out_err;
+	}
+	/* Extents decoded properly, now try to merge them in to
+	 * existing layout extents.
+	 */
+	spin_lock(&bl->bl_ext_lock);
+	list_for_each_entry_safe(be, save, &extents, be_node) {
+		list_del(&be->be_node);
+		status = add_and_merge_extent(bl, be);
+		if (status) {
+			spin_unlock(&bl->bl_ext_lock);
+			/* This is a fairly catastrophic error, as the
+			 * entire layout extent lists are now corrupted.
+			 * We should have some way to distinguish this.
+			 */
+			be = NULL;
+			goto out_err;
+		}
+	}
+	spin_unlock(&bl->bl_ext_lock);
+	status = 0;
+ out:
+	dprintk("%s returns %i\n", __func__, status);
+	return status;
+
+ out_err:
+	put_extent(be);
+	while (!list_empty(&extents)) {
+		be = list_first_entry(&extents, struct pnfs_block_extent,
+				      be_node);
+		list_del(&be->be_node);
+		put_extent(be);
+	}
+	goto out;
+}
diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdm.c
--- linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2011-03-26 07:57:44.239821607 -0400
+++ linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2011-03-26 07:57:44.239821607 -0400
@@ -0,0 +1,120 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayoutdm.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2007 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Fred Isaman <iisaman@umich.edu>
+ * Andy Adamson <andros@citi.umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/genhd.h> /* gendisk - used in a dprintk*/
+#include <linux/sched.h>
+#include <linux/hash.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/* Defines used for calculating memory usage in nfs4_blk_flatten() */
+#define ARGSIZE 24 /* Max bytes needed for linear target arg string */
+#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE)
+#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE)
+#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \
+ (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE)
+#define roundup8(x) (((x)+7) & ~7)
+#define sizeof8(x) roundup8(sizeof(x))
+
+/* Ask the userspace daemon (via the pipefs upcall) to tear down the
+ * device node for @dev.  Returns 0 on success, 1 on any failure.
+ */
+static int dev_remove(dev_t dev)
+{
+	int ret = 1;
+	struct pipefs_hdr *msg = NULL, *reply = NULL;
+	uint64_t bl_dev;
+	uint32_t major = MAJOR(dev), minor = MINOR(dev);
+
+	dprintk("Entering %s\n", __func__);
+
+	if (IS_ERR(bl_device_pipe))
+		return ret;
+
+	/* Pack major and minor as two consecutive u32s in the payload. */
+	memcpy((void *)&bl_dev, &major, sizeof(uint32_t));
+	memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t));
+	msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev,
+				    sizeof(uint64_t));
+	if (IS_ERR(msg)) {
+		dprintk("ERROR: couldn't make pipefs message.\n");
+		goto out;
+	}
+	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
+	msg->status = BL_DEVICE_REQUEST_INIT;
+
+	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
+					      &bl_device_list, 0, 0);
+	if (IS_ERR(reply)) {
+		dprintk("ERROR: upcall_waitreply failed\n");
+		goto out;
+	}
+
+	if (reply->status == BL_DEVICE_REQUEST_PROC)
+		ret = 0; /*TODO: what to return*/
+out:
+	if (!IS_ERR(reply))
+		kfree(reply);
+	if (!IS_ERR(msg))
+		kfree(msg);
+	return ret;
+}
+
+/*
+ * Release meta device
+ */
+/* Close the kernel-side block device, then ask userspace to remove the
+ * corresponding DM node.  Returns dev_remove()'s result.
+ */
+static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
+{
+	int rv;
+
+	dprintk("%s Releasing\n", __func__);
+	/* XXX Check return? */
+	rv = nfs4_blkdev_put(bdev->bm_mdev);
+	dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv);
+
+	rv = dev_remove(bdev->bm_mdev->bd_dev);
+	dprintk("%s Returns %d\n", __func__, rv);
+	return rv;
+}
+
+/* Release a pnfs_block_dev, closing its meta device if one is attached.
+ * NULL is tolerated.
+ */
+void free_block_dev(struct pnfs_block_dev *bdev)
+{
+	if (!bdev)
+		return;
+	if (bdev->bm_mdev) {
+		dprintk("%s Removing DM device: %d:%d\n",
+			__func__,
+			MAJOR(bdev->bm_mdev->bd_dev),
+			MINOR(bdev->bm_mdev->bd_dev));
+		/* XXX Check status ?? */
+		nfs4_blk_metadev_release(bdev);
+	}
+	kfree(bdev);
+}
diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.h
--- linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2011-03-26 07:57:44.237821622 -0400
+++ linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.h 2011-03-26 07:57:44.238821614 -0400
@@ -0,0 +1,302 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.h
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
+#define FS_NFS_NFS4BLOCKLAYOUT_H
+
+#include <linux/nfs_fs.h>
+#include <linux/dm-ioctl.h> /* Needed for struct dm_ioctl*/
+#include "../pnfs.h"
+
+#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9)
+
+#define PG_pnfserr PG_owner_priv_1
+#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags)
+#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags)
+#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags)
+
+extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */
+extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */
+extern int dm_do_resume(struct dm_ioctl *param);
+extern int dm_table_load(struct dm_ioctl *param, size_t param_size);
+
+struct block_mount_id {
+	spinlock_t			bm_lock;    /* protects list */
+	struct list_head		bm_devlist; /* holds pnfs_block_dev */
+};
+
+struct pnfs_block_dev {
+	struct list_head		bm_node;      /* link into bm_devlist */
+	struct nfs4_deviceid		bm_mdevid;    /* associated devid */
+	struct block_device		*bm_mdev;     /* meta device itself */
+};
+
+/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */
+struct visible_block_device {
+	struct list_head	vi_node;     /* link into per-server sdlist */
+	struct block_device	*vi_bdev;    /* the visible disk */
+	int			vi_mapped;   /* nonzero once used in a mapping — confirm at use sites */
+	int			vi_put_done; /* presumably: blkdev already put — confirm at use sites */
+};
+
+enum blk_vol_type {
+	PNFS_BLOCK_VOLUME_SIMPLE   = 0,	/* maps to a single LU */
+	PNFS_BLOCK_VOLUME_SLICE    = 1,	/* slice of another volume */
+	PNFS_BLOCK_VOLUME_CONCAT   = 2,	/* concatenation of multiple volumes */
+	PNFS_BLOCK_VOLUME_STRIPE   = 3	/* striped across multiple volumes */
+};
+
+/* All disk offset/lengths are stored in 512-byte sectors */
+struct pnfs_blk_volume {
+	uint32_t		bv_type;   /* one of enum blk_vol_type */
+	sector_t 		bv_size;   /* total size of this volume */
+	struct pnfs_blk_volume 	**bv_vols; /* component volumes */
+	int 			bv_vol_n;  /* number of entries in bv_vols */
+	union {
+		dev_t			bv_dev;         /* per-type data; which member */
+		sector_t		bv_stripe_unit; /* is valid presumably follows   */
+		sector_t 		bv_offset;      /* bv_type — confirm in decoder */
+	};
+};
+
+/* Since components need not be aligned, cannot use sector_t */
+struct pnfs_blk_sig_comp {
+	int64_t 	bs_offset;  /* In bytes */
+	uint32_t   	bs_length;  /* In bytes */
+	char 		*bs_string;
+};
+
+/* Maximum number of signatures components in a simple volume */
+# define PNFS_BLOCK_MAX_SIG_COMP 16
+
+struct pnfs_blk_sig {
+	int si_num_comps; /* <= PNFS_BLOCK_MAX_SIG_COMP */
+	struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP];
+};
+
+enum exstate4 {
+	PNFS_BLOCK_READWRITE_DATA	= 0,
+	PNFS_BLOCK_READ_DATA		= 1,
+	PNFS_BLOCK_INVALID_DATA		= 2, /* mapped, but data is invalid */
+	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */
+};
+
+#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
+
+struct my_tree_t {
+	sector_t		mtt_step_size;	/* Internal sector alignment */
+	struct list_head	mtt_stub; /* Should be a radix tree */
+};
+
+struct pnfs_inval_markings {
+	spinlock_t	im_lock;       /* guards im_tree */
+	struct my_tree_t im_tree;	/* Sectors that need LAYOUTCOMMIT */
+	sector_t	im_block_size;	/* Server blocksize in sectors */
+};
+
+struct pnfs_inval_tracking {
+	struct list_head it_link;
+	int		 it_sector; /* NOTE(review): int may truncate 64-bit sector values assigned from u64 in _add_entry — confirm */
+	int		 it_tags;   /* bit mask of EXTENT_* tag bits */
+};
+
+/* sector_t fields are all in 512-byte sectors */
+struct pnfs_block_extent {
+	struct kref	be_refcnt;
+	struct list_head be_node;	/* link into lseg list */
+	struct nfs4_deviceid be_devid;  /* STUB - removable??? */
+	struct block_device *be_mdev;
+	sector_t	be_f_offset;	/* the starting offset in the file */
+	sector_t	be_length;	/* the size of the extent */
+	sector_t	be_v_offset;	/* the starting offset in the volume */
+	enum exstate4	be_state;	/* the state of this extent */
+	struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
+};
+
+/* Shortened extent used by LAYOUTCOMMIT */
+struct pnfs_block_short_extent {
+	struct list_head bse_node;
+	struct nfs4_deviceid bse_devid;	/* STUB - removable??? */
+	struct block_device *bse_mdev;
+	sector_t	bse_f_offset;	/* the starting offset in the file */
+	sector_t	bse_length;	/* the size of the extent */
+};
+
+static inline void
+INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) /* one-time init; step = min(page, block) in sectors */
+{
+	spin_lock_init(&marks->im_lock);
+	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
+	marks->im_block_size = blocksize;
+	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
+					   blocksize);
+}
+
+enum extentclass4 {
+	RW_EXTENT       = 0, /* READWRITE and INVAL */
+	RO_EXTENT       = 1, /* READ and NONE */
+	EXTENT_LISTS    = 2,
+};
+
+static inline int choose_list(enum exstate4 state) /* map extent state to its bl_extents[] list index */
+{
+	if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
+		return RO_EXTENT;
+	else
+		return RW_EXTENT;
+}
+
+struct pnfs_block_layout {
+	struct pnfs_layout_hdr bl_layout; /* generic layer header; must be first for container_of */
+	struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
+	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
+	struct list_head	bl_extents[EXTENT_LISTS];	/* R and RW extents */
+	struct list_head	bl_commit;	/* Needs layout commit */
+	unsigned int		bl_count;	/* entries in bl_commit */
+	sector_t		bl_blocksize;  /* Server blocksize in sectors */
+};
+
+/* this struct is communicated between:
+ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit
+ */
+struct bl_layoutupdate_data {
+	struct list_head ranges;
+};
+
+#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) /* per-server mount state */
+
+static inline struct pnfs_block_layout *
+BLK_LO2EXT(struct pnfs_layout_hdr *lo) /* generic layout hdr -> block layout */
+{
+	return container_of(lo, struct pnfs_block_layout, bl_layout);
+}
+
+static inline struct pnfs_block_layout *
+BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) /* layout segment -> block layout */
+{
+	return BLK_LO2EXT(lseg->pls_layout);
+}
+
+uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes); /* advance p by nbytes; NULL if it would pass end */
+
+#define BLK_READBUF(p, e, nbytes)  do { \
+	p = blk_overflow(p, e, nbytes); \
+	if (!p) { \
+		printk(KERN_WARNING \
+			"%s: reply buffer overflowed in line %d.\n", \
+			__func__, __LINE__); \
+		goto out_err; \
+	} \
+} while (0)
+
+#define READ32(x)         (x) = ntohl(*p++) /* decode one 32-bit XDR word */
+#define READ64(x)         do {                  \
+	(x) = (uint64_t)ntohl(*p++) << 32;      \
+	(x) |= ntohl(*p++);                     \
+} while (0)
+#define COPYMEM(x, nbytes) do {                 \
+	memcpy((x), p, nbytes);                 \
+	p += XDR_QUADLEN(nbytes);               \
+} while (0)
+#define READ_DEVID(x)     COPYMEM((x)->data, NFS4_DEVICEID4_SIZE)
+#define READ_SECTOR(x)     do { /* decode a byte offset; must be 512-aligned */ \
+	READ64(tmp); \
+	if (tmp & 0x1ff) { \
+		printk(KERN_WARNING \
+		       "%s Value not 512-byte aligned at line %d\n", \
+		       __func__, __LINE__); \
+		goto out_err; \
+	} \
+	(x) = tmp >> 9; \
+} while (0)
+
+#define WRITE32(n) do { \
+	*p++ = htonl(n); \
+	} while (0)
+#define WRITE64(n) do {                         \
+	*p++ = htonl((uint32_t)((n) >> 32));        \
+	*p++ = htonl((uint32_t)(n));                \
+} while (0)
+#define WRITEMEM(ptr, nbytes) do {              \
+	p = xdr_encode_opaque_fixed(p, ptr, nbytes);    \
+} while (0)
+#define WRITE_DEVID(x)  WRITEMEM((x)->data, NFS4_DEVICEID4_SIZE)
+
+/* blocklayoutdev.c */
+struct block_device *nfs4_blkdev_get(dev_t dev);
+int nfs4_blkdev_put(struct block_device *bdev);
+struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
+ struct pnfs_device *dev,
+ struct list_head *sdlist);
+int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+ struct nfs4_layoutget_res *lgr);
+int nfs4_blk_create_block_disk_list(struct list_head *);
+void nfs4_blk_destroy_disk_list(struct list_head *);
+/* blocklayoutdm.c */
+int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *);
+void free_block_dev(struct pnfs_block_dev *bdev);
+/* extents.c */
+struct pnfs_block_extent *
+find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent **cow_read);
+int mark_initialized_sectors(struct pnfs_inval_markings *marks,
+ sector_t offset, sector_t length,
+ sector_t **pages);
+void put_extent(struct pnfs_block_extent *be);
+struct pnfs_block_extent *alloc_extent(void);
+struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be);
+int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect);
+int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *arg);
+void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+ const struct nfs4_layoutcommit_args *arg,
+ int status);
+int add_and_merge_extent(struct pnfs_block_layout *bl,
+ struct pnfs_block_extent *new);
+int mark_for_commit(struct pnfs_block_extent *be,
+ sector_t offset, sector_t length);
+
+#include <linux/sunrpc/simple_rpc_pipefs.h>
+
+extern struct pipefs_list bl_device_list;
+extern struct dentry *bl_device_pipe;
+
+int bl_pipe_init(void);
+void bl_pipe_exit(void);
+
+#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
+#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/
+#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
+#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
+#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
+
+#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.38.noarch/fs/nfs/blocklayout/extents.c
--- linux-2.6.38.noarch/fs/nfs/blocklayout/extents.c.orig 2011-03-26 07:57:44.240821600 -0400
+++ linux-2.6.38.noarch/fs/nfs/blocklayout/extents.c 2011-03-26 07:57:44.240821600 -0400
@@ -0,0 +1,948 @@
+/*
+ * linux/fs/nfs/blocklayout/extents.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include "blocklayout.h"
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/* Bit numbers used as tags in the inval tree; must stay below MY_MAX_TAGS */
+#define EXTENT_INITIALIZED 0
+#define EXTENT_WRITTEN     1
+#define EXTENT_IN_COMMIT   2
+#define INTERNAL_EXISTS    MY_MAX_TAGS
+#define INTERNAL_MASK      ((1 << INTERNAL_EXISTS) - 1)
+
+/* Returns largest t<=s s.t. t%base==0 */
+static inline sector_t normalize(sector_t s, int base)
+{
+	sector_t tmp = s; /* Since do_div modifies its argument */
+	return s - do_div(tmp, base);
+}
+
+static inline sector_t normalize_up(sector_t s, int base) /* smallest t>=s with t%base==0 */
+{
+	return normalize(s + base - 1, base);
+}
+
+/* Complete stub using a list while determining the API wanted */
+
+/* Returns the entry's public tag bits (INTERNAL_EXISTS stripped), or -ENOENT */
+static int32_t _find_entry(struct my_tree_t *tree, u64 s)
+{
+	struct pnfs_inval_tracking *pos;
+
+	dprintk("%s(%llu) enter\n", __func__, s);
+	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+		if (pos->it_sector > s) /* list is ascending; reverse walk skips larger */
+			continue;
+		else if (pos->it_sector == s)
+			return pos->it_tags & INTERNAL_MASK;
+		else
+			break;
+	}
+	return -ENOENT;
+}
+
+static inline
+int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag) /* 1 if step-aligned entry for s carries tag */
+{
+	int32_t tags;
+
+	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
+	s = normalize(s, tree->mtt_step_size); /* align down to the tree's granularity */
+	tags = _find_entry(tree, s);
+	if ((tags < 0) || !(tags & (1 << tag)))
+		return 0;
+	else
+		return 1;
+}
+
+/* Creates entry with tag, or if entry already exists, unions tag to it.
+ * If storage is not NULL, newly created entry will use it.
+ * Returns number of entries added (0 or 1), or negative on error.
+ */
+static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag,
+		      struct pnfs_inval_tracking *storage)
+{
+	int found = 0;
+	struct pnfs_inval_tracking *pos;
+
+	dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
+	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+		if (pos->it_sector > s)
+			continue;
+		else if (pos->it_sector == s) {
+			found = 1;
+			break;
+		} else
+			break;
+	}
+	if (found) {
+		pos->it_tags |= (1 << tag);
+		return 0;
+	} else {
+		struct pnfs_inval_tracking *new;
+		if (storage)
+			new = storage;
+		else {
+			new = kmalloc(sizeof(*new), GFP_KERNEL); /* NOTE(review): some callers hold im_lock; GFP_KERNEL can sleep — confirm */
+			if (!new)
+				return -ENOMEM;
+		}
+		new->it_sector = s; /* NOTE(review): it_sector is int; large u64 s would truncate — confirm */
+		new->it_tags = (1 << tag);
+		list_add(&new->it_link, &pos->it_link); /* if nothing smaller found, &pos->it_link is the list head */
+		return 1;
+	}
+}
+
+/* XXXX Really want option to not create */
+/* Over range, unions tag with existing entries, else creates entry with tag */
+static int _set_range(struct my_tree_t *tree, int32_t tag, u64 s, u64 length)
+{
+	u64 i; /* step-aligned cursor over [s, s+length) */
+
+	dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
+	for (i = normalize(s, tree->mtt_step_size); i < s + length;
+	     i += tree->mtt_step_size)
+		if (_add_entry(tree, i, tag, NULL) < 0) /* 1 == created, that's success */
+			return -ENOMEM;
+	return 0;
+}
+
+/* Ensure that future operations on given range of tree will not malloc */
+static int _preload_range(struct my_tree_t *tree, u64 offset, u64 length)
+{
+	u64 start, end, s;
+	int count, i, used = 0, status = -ENOMEM;
+	struct pnfs_inval_tracking **storage;
+
+	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
+	start = normalize(offset, tree->mtt_step_size);
+	end = normalize_up(offset + length, tree->mtt_step_size);
+	count = (int)(end - start) / (int)tree->mtt_step_size; /* worst-case new entries */
+
+	/* Pre-malloc what memory we might need */
+	storage = kmalloc(sizeof(*storage) * count, GFP_KERNEL);
+	if (!storage)
+		return -ENOMEM;
+	for (i = 0; i < count; i++) {
+		storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
+				     GFP_KERNEL);
+		if (!storage[i])
+			goto out_cleanup;
+	}
+
+	/* Now need lock - HOW??? */
+
+	for (s = start; s < end; s += tree->mtt_step_size)
+		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); /* returns 1 iff storage[used] was consumed */
+
+	/* Unlock - HOW??? */
+	status = 0;
+
+ out_cleanup:
+	for (i = used; i < count; i++) { /* free only the unconsumed preallocations */
+		if (!storage[i])
+			break;
+		kfree(storage[i]);
+	}
+	kfree(storage);
+	return status;
+}
+
+static void set_needs_init(sector_t *array, sector_t offset) /* sorted insert into a ~0-terminated array; no-op if present */
+{
+	sector_t *p = array;
+
+	dprintk("%s enter\n", __func__);
+	if (!p)
+		return;
+	while (*p < offset) /* ~0 sentinel guarantees termination */
+		p++;
+	if (*p == offset)
+		return;
+	else if (*p == ~0) { /* hit terminator: append */
+		*p++ = offset;
+		*p = ~0;
+		return;
+	} else {
+		sector_t *save = p;
+		dprintk("%s Adding %llu\n", __func__, (u64)offset);
+		while (*p != ~0)
+			p++;
+		p++;
+		memmove(save + 1, save, (char *)p - (char *)save); /* shift tail incl. terminator; assumes caller sized array with one spare slot — see mark_initialized_sectors */
+		*save = offset;
+		return;
+	}
+}
+
+/* We are relying on page lock to serialize this */
+int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect)
+{
+	int rv;
+
+	spin_lock(&marks->im_lock);
+	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
+	spin_unlock(&marks->im_lock);
+	return rv;
+}
+
+/* Assume start, end already sector aligned */
+static int
+_range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag) /* 1 iff every step in [start,end) carries tag */
+{
+	struct pnfs_inval_tracking *pos;
+	u64 expect = 0; /* next sector we must see, walking downward; 0 == not started */
+
+	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
+	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+		if (pos->it_sector >= end)
+			continue;
+		if (!expect) {
+			if ((pos->it_sector == end - tree->mtt_step_size) &&
+			    (pos->it_tags & (1 << tag))) {
+				expect = pos->it_sector - tree->mtt_step_size;
+				if (expect < start)
+					return 1;
+				continue;
+			} else {
+				return 0;
+			}
+		}
+		if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) /* gap or missing tag */
+			return 0;
+		expect -= tree->mtt_step_size;
+		if (expect < start)
+			return 1;
+	}
+	return 0;
+}
+
+static int is_range_written(struct pnfs_inval_markings *marks,
+			    sector_t start, sector_t end)
+{
+	int rv;
+
+	spin_lock(&marks->im_lock);
+	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
+	spin_unlock(&marks->im_lock);
+	return rv;
+}
+
+/* Marks sectors in [offset, offset+length) as having been initialized.
+ * All lengths are step-aligned, where step is min(pagesize, blocksize).
+ * Notes where partial block is initialized, and helps prepare it for
+ * complete initialization later.
+ */
+/* Currently assumes offset is page-aligned */
+int mark_initialized_sectors(struct pnfs_inval_markings *marks,
+			     sector_t offset, sector_t length,
+			     sector_t **pages)
+{
+	sector_t s, start, end;
+	sector_t *array = NULL; /* Pages to mark */
+
+	dprintk("%s(offset=%llu,len=%llu) enter\n",
+		__func__, (u64)offset, (u64)length);
+	s = max((sector_t) 3,
+		2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); /* worst-case boundary pages + ~0 terminator */
+	dprintk("%s set max=%llu\n", __func__, (u64)s);
+	if (pages) {
+		array = kmalloc(s * sizeof(sector_t), GFP_KERNEL);
+		if (!array)
+			goto outerr;
+		array[0] = ~0; /* terminator; array is kept sorted by set_needs_init */
+	}
+
+	start = normalize(offset, marks->im_block_size);
+	end = normalize_up(offset + length, marks->im_block_size);
+	if (_preload_range(&marks->im_tree, start, end - start)) /* so _set_range below never allocates under the lock */
+		goto outerr;
+
+	spin_lock(&marks->im_lock);
+
+	for (s = normalize_up(start, PAGE_CACHE_SECTORS);
+	     s < offset; s += PAGE_CACHE_SECTORS) {
+		dprintk("%s pre-area pages\n", __func__);
+		/* Portion of used block is not initialized */
+		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
+			set_needs_init(array, s);
+	}
+	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
+		goto out_unlock;
+	for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
+	     s < end; s += PAGE_CACHE_SECTORS) {
+		dprintk("%s post-area pages\n", __func__);
+		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
+			set_needs_init(array, s);
+	}
+
+	spin_unlock(&marks->im_lock);
+
+	if (pages) {
+		if (array[0] == ~0) { /* nothing collected; hand back NULL */
+			kfree(array);
+			*pages = NULL;
+		} else
+			*pages = array; /* ownership passes to caller */
+	}
+	return 0;
+
+ out_unlock:
+	spin_unlock(&marks->im_lock);
+ outerr:
+	if (pages) {
+		kfree(array);
+		*pages = NULL;
+	}
+	return -ENOMEM;
+}
+
+/* Marks sectors in [offset, offset+length) as having been written to disk.
+ * All lengths should be block aligned.
+ */
+int mark_written_sectors(struct pnfs_inval_markings *marks,
+			 sector_t offset, sector_t length)
+{
+	int status;
+
+	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
+		(u64)offset, (u64)length);
+	spin_lock(&marks->im_lock);
+	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); /* NOTE(review): may allocate under im_lock — see _add_entry */
+	spin_unlock(&marks->im_lock);
+	return status;
+}
+
+static void print_short_extent(struct pnfs_block_short_extent *be) /* debug dump; NULL-safe */
+{
+	dprintk("PRINT SHORT EXTENT extent %p\n", be);
+	if (be) {
+		dprintk("        be_f_offset %llu\n", (u64)be->bse_f_offset);
+		dprintk("        be_length   %llu\n", (u64)be->bse_length);
+	}
+}
+
+void print_clist(struct list_head *list, unsigned int count) /* debug dump; warns if count disagrees with list length */
+{
+	struct pnfs_block_short_extent *be;
+	unsigned int i = 0;
+
+	dprintk("****************\n");
+	dprintk("Extent list looks like:\n");
+	list_for_each_entry(be, list, bse_node) {
+		i++;
+		print_short_extent(be);
+	}
+	if (i != count)
+		dprintk("\n\nExpected %u entries\n\n\n", count);
+	dprintk("****************\n");
+}
+
+/* Note: In theory, we should do more checking that devid's match between
+ * old and new, but if they don't, the lists are too corrupt to salvage anyway.
+ */
+/* Note this is very similar to add_and_merge_extent */
+static void add_to_commitlist(struct pnfs_block_layout *bl,
+			      struct pnfs_block_short_extent *new)
+{
+	struct list_head *clist = &bl->bl_commit;
+	struct pnfs_block_short_extent *old, *save;
+	sector_t end = new->bse_f_offset + new->bse_length;
+
+	dprintk("%s enter\n", __func__);
+	print_short_extent(new);
+	print_clist(clist, bl->bl_count);
+	bl->bl_count++; /* assume insertion; undone on the subsumed/merge paths below */
+	/* Scan for proper place to insert, extending new to the left
+	 * as much as possible.
+	 */
+	list_for_each_entry_safe(old, save, clist, bse_node) {
+		if (new->bse_f_offset < old->bse_f_offset)
+			break;
+		if (end <= old->bse_f_offset + old->bse_length) {
+			/* Range is already in list */
+			bl->bl_count--;
+			kfree(new); /* caller handed ownership to us */
+			return;
+		} else if (new->bse_f_offset <=
+				old->bse_f_offset + old->bse_length) {
+			/* new overlaps or abuts existing be */
+			if (new->bse_mdev == old->bse_mdev) {
+				/* extend new to fully replace old */
+				new->bse_length += new->bse_f_offset -
+						old->bse_f_offset;
+				new->bse_f_offset = old->bse_f_offset;
+				list_del(&old->bse_node);
+				bl->bl_count--;
+				kfree(old);
+			}
+		}
+	}
+	/* Note that if we never hit the above break, old will not point to a
+	 * valid extent.  However, in that case &old->bse_node==list.
+	 */
+	list_add_tail(&new->bse_node, &old->bse_node);
+	/* Scan forward for overlaps.  If we find any, extend new and
+	 * remove the overlapped extent.
+	 */
+	old = list_prepare_entry(new, clist, bse_node);
+	list_for_each_entry_safe_continue(old, save, clist, bse_node) {
+		if (end < old->bse_f_offset)
+			break;
+		/* new overlaps or abuts old */
+		if (new->bse_mdev == old->bse_mdev) {
+			if (end < old->bse_f_offset + old->bse_length) {
+				/* extend new to fully cover old */
+				end = old->bse_f_offset + old->bse_length;
+				new->bse_length = end - new->bse_f_offset;
+			}
+			list_del(&old->bse_node);
+			bl->bl_count--;
+			kfree(old);
+		}
+	}
+	dprintk("%s: after merging\n", __func__);
+	print_clist(clist, bl->bl_count);
+}
+
+/* Note the range described by offset, length is guaranteed to be contained
+ * within be.  Adds the block-normalized, fully-written portion of the range
+ * to the layout's commit list.  Returns 0 or -ENOMEM.
+ */
+int mark_for_commit(struct pnfs_block_extent *be,
+		    sector_t offset, sector_t length)
+{
+	sector_t new_end, end = offset + length;
+	struct pnfs_block_short_extent *new;
+	struct pnfs_block_layout *bl = container_of(be->be_inval,
+						    struct pnfs_block_layout,
+						    bl_inval);
+
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	mark_written_sectors(be->be_inval, offset, length); /* return value ignored; best effort */
+	/* We want to add the range to commit list, but it must be
+	 * block-normalized, and verified that the normalized range has
+	 * been entirely written to disk.
+	 */
+	new->bse_f_offset = offset;
+	offset = normalize(offset, bl->bl_blocksize); /* round start down to a block boundary */
+	if (offset < new->bse_f_offset) {
+		if (is_range_written(be->be_inval, offset, new->bse_f_offset))
+			new->bse_f_offset = offset;
+		else
+			new->bse_f_offset = offset + bl->bl_blocksize; /* partial block not fully written: shrink */
+	}
+	new_end = normalize_up(end, bl->bl_blocksize);
+	if (end < new_end) {
+		if (is_range_written(be->be_inval, end, new_end))
+			end = new_end;
+		else
+			end = new_end - bl->bl_blocksize;
+	}
+	if (end <= new->bse_f_offset) { /* nothing block-complete to commit */
+		kfree(new);
+		return 0;
+	}
+	new->bse_length = end - new->bse_f_offset;
+	new->bse_devid = be->be_devid;
+	new->bse_mdev = be->be_mdev;
+
+	spin_lock(&bl->bl_ext_lock);
+	/* new will be freed, either by add_to_commitlist if it decides not
+	 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
+	 */
+	add_to_commitlist(bl, new);
+	spin_unlock(&bl->bl_ext_lock);
+	return 0;
+}
+
+static void print_bl_extent(struct pnfs_block_extent *be) /* debug dump; NULL-safe */
+{
+	dprintk("PRINT EXTENT extent %p\n", be);
+	if (be) {
+		dprintk("        be_f_offset %llu\n", (u64)be->be_f_offset);
+		dprintk("        be_length   %llu\n", (u64)be->be_length);
+		dprintk("        be_v_offset %llu\n", (u64)be->be_v_offset);
+		dprintk("        be_state    %d\n", be->be_state);
+	}
+}
+
+static void
+destroy_extent(struct kref *kref) /* kref release callback for put_extent */
+{
+	struct pnfs_block_extent *be;
+
+	be = container_of(kref, struct pnfs_block_extent, be_refcnt);
+	dprintk("%s be=%p\n", __func__, be);
+	kfree(be);
+}
+
+void
+put_extent(struct pnfs_block_extent *be) /* NULL-safe; frees on last reference */
+{
+	if (be) {
+		dprintk("%s enter %p (%i)\n", __func__, be,
+			atomic_read(&be->be_refcnt.refcount));
+		kref_put(&be->be_refcnt, destroy_extent);
+	}
+}
+
+struct pnfs_block_extent *alloc_extent(void) /* new extent with refcount 1, or NULL */
+{
+	struct pnfs_block_extent *be;
+
+	be = kmalloc(sizeof(struct pnfs_block_extent), GFP_KERNEL);
+	if (!be)
+		return NULL;
+	INIT_LIST_HEAD(&be->be_node);
+	kref_init(&be->be_refcnt);
+	be->be_inval = NULL;
+	return be;
+}
+
+struct pnfs_block_extent *
+get_extent(struct pnfs_block_extent *be) /* NULL-safe ref bump; returns its argument */
+{
+	if (be)
+		kref_get(&be->be_refcnt);
+	return be;
+}
+
+void print_elist(struct list_head *list) /* debug dump of a full extent list */
+{
+	struct pnfs_block_extent *be;
+	dprintk("****************\n");
+	dprintk("Extent list looks like:\n");
+	list_for_each_entry(be, list, be_node) {
+		print_bl_extent(be);
+	}
+	dprintk("****************\n");
+}
+
+static inline int
+extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) /* 1 if the two may be merged into one extent */
+{
+	/* Note this assumes new->be_f_offset >= old->be_f_offset */
+	return (new->be_state == old->be_state) &&
+	       ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
+	        ((new->be_v_offset - old->be_v_offset ==
+	          new->be_f_offset - old->be_f_offset) &&
+	         new->be_mdev == old->be_mdev)); /* holes need no device/offset agreement */
+}
+
+/* Adds new to appropriate list in bl, modifying new and removing existing
+ * extents as appropriate to deal with overlaps.
+ *
+ * See find_get_extent for list constraints.
+ *
+ * Refcount on new is already set.  If end up not using it, or error out,
+ * need to put the reference.
+ *
+ * Lock is held by caller.  Returns 0, or -EIO on inconsistent overlap.
+ */
+int
+add_and_merge_extent(struct pnfs_block_layout *bl,
+		     struct pnfs_block_extent *new)
+{
+	struct pnfs_block_extent *be, *tmp;
+	sector_t end = new->be_f_offset + new->be_length;
+	struct list_head *list;
+
+	dprintk("%s enter with be=%p\n", __func__, new);
+	print_bl_extent(new);
+	list = &bl->bl_extents[choose_list(new->be_state)];
+	print_elist(list);
+
+	/* Scan for proper place to insert, extending new to the left
+	 * as much as possible.
+	 */
+	list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
+		if (new->be_f_offset >= be->be_f_offset + be->be_length)
+			break;
+		if (new->be_f_offset >= be->be_f_offset) {
+			if (end <= be->be_f_offset + be->be_length) {
+				/* new is a subset of existing be*/
+				if (extents_consistent(be, new)) {
+					dprintk("%s: new is subset, ignoring\n",
+						__func__);
+					put_extent(new);
+					return 0;
+				} else {
+					goto out_err;
+				}
+			} else {
+				/* |<--   be   -->|
+				 *          |<--   new   -->| */
+				if (extents_consistent(be, new)) {
+					/* extend new to fully replace be */
+					new->be_length += new->be_f_offset -
+						be->be_f_offset;
+					new->be_f_offset = be->be_f_offset;
+					new->be_v_offset = be->be_v_offset;
+					dprintk("%s: removing %p\n", __func__, be);
+					list_del(&be->be_node);
+					put_extent(be);
+				} else {
+					goto out_err;
+				}
+			}
+		} else if (end >= be->be_f_offset + be->be_length) {
+			/* new extent overlap existing be */
+			if (extents_consistent(be, new)) {
+				/* extend new to fully replace be */
+				dprintk("%s: removing %p\n", __func__, be);
+				list_del(&be->be_node);
+				put_extent(be);
+			} else {
+				goto out_err;
+			}
+		} else if (end > be->be_f_offset) {
+			/*           |<--   be   -->|
+			 *|<--   new   -->| */
+			if (extents_consistent(new, be)) {
+				/* extend new to fully replace be */
+				new->be_length += be->be_f_offset + be->be_length -
+					new->be_f_offset - new->be_length;
+				dprintk("%s: removing %p\n", __func__, be);
+				list_del(&be->be_node);
+				put_extent(be);
+			} else {
+				goto out_err;
+			}
+		}
+	}
+	/* Note that if we never hit the above break, be will not point to a
+	 * valid extent.  However, in that case &be->be_node==list.
+	 */
+	list_add(&new->be_node, &be->be_node);
+	dprintk("%s: inserting new\n", __func__);
+	print_elist(list);
+	/* STUB - The per-list consistency checks have all been done,
+	 * should now check cross-list consistency.
+	 */
+	return 0;
+
+ out_err:
+	put_extent(new);
+	return -EIO;
+}
+
+/* Returns extent, or NULL.  If a second READ extent exists, it is returned
+ * in cow_read, if given.
+ *
+ * The extents are kept in two separate ordered lists, one for READ and NONE,
+ * one for READWRITE and INVALID.  Within each list, we assume:
+ * 1. Extents are ordered by file offset.
+ * 2. For any given isect, there is at most one extent that matches.
+ */
+struct pnfs_block_extent *
+find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
+	    struct pnfs_block_extent **cow_read)
+{
+	struct pnfs_block_extent *be, *cow, *ret;
+	int i;
+
+	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
+	cow = ret = NULL;
+	spin_lock(&bl->bl_ext_lock);
+	for (i = 0; i < EXTENT_LISTS; i++) {
+		if (ret &&
+		    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) /* only INVALID needs a COW source */
+			break;
+		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
+			if (isect >= be->be_f_offset + be->be_length)
+				break;
+			if (isect >= be->be_f_offset) {
+				/* We have found an extent */
+				dprintk("%s Get %p (%i)\n", __func__, be,
+					atomic_read(&be->be_refcnt.refcount));
+				kref_get(&be->be_refcnt); /* each returned pointer carries a reference */
+				if (!ret)
+					ret = be;
+				else if (be->be_state != PNFS_BLOCK_READ_DATA)
+					put_extent(be);
+				else
+					cow = be;
+				break;
+			}
+		}
+	}
+	spin_unlock(&bl->bl_ext_lock);
+	if (cow_read)
+		*cow_read = cow;
+	print_bl_extent(ret);
+	return ret;
+}
+
+/* Similar to find_get_extent, but called with lock held, and ignores cow */
+static struct pnfs_block_extent *
+find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
+{
+	struct pnfs_block_extent *be, *ret = NULL;
+	int i;
+
+	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
+	for (i = 0; i < EXTENT_LISTS; i++) {
+		if (ret)
+			break;
+		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
+			if (isect >= be->be_f_offset + be->be_length)
+				break;
+			if (isect >= be->be_f_offset) {
+				/* We have found an extent */
+				dprintk("%s Get %p (%i)\n", __func__, be,
+					atomic_read(&be->be_refcnt.refcount));
+				kref_get(&be->be_refcnt); /* returned pointer carries a reference */
+				ret = be;
+				break;
+			}
+		}
+	}
+	print_bl_extent(ret);
+	return ret;
+}
+
+int
+encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+			       struct xdr_stream *xdr,
+			       const struct nfs4_layoutcommit_args *arg) /* XDR-encode bl_commit; moves encoded entries to bld->ranges */
+{
+	sector_t start, end;
+	struct pnfs_block_short_extent *lce, *save;
+	unsigned int count = 0;
+	struct bl_layoutupdate_data *bld = arg->layoutdriver_data;
+	struct list_head *ranges = &bld->ranges;
+	__be32 *p, *xdr_start;
+
+	dprintk("%s enter\n", __func__);
+	start = arg->range.offset >> 9; /* bytes -> 512-byte sectors */
+	end = start + (arg->range.length >> 9);
+	dprintk("%s set start=%llu, end=%llu\n",
+		__func__, (u64)start, (u64)end);
+
+	/* BUG - creation of bl_commit is buggy - need to wait for
+	 * entire block to be marked WRITTEN before it can be added.
+	 */
+	spin_lock(&bl->bl_ext_lock);
+	/* Want to adjust for possible truncate */
+	/* We now want to adjust argument range */
+
+	/* XDR encode the ranges found */
+	xdr_start = xdr_reserve_space(xdr, 8); /* placeholder: total length + entry count */
+	if (!xdr_start)
+		goto out;
+	list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
+		p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); /* devid + 3x u64 + 1x u32 */
+		if (!p)
+			break;
+		WRITE_DEVID(&lce->bse_devid);
+		WRITE64(lce->bse_f_offset << 9); /* sectors -> bytes on the wire */
+		WRITE64(lce->bse_length << 9);
+		WRITE64(0LL);
+		WRITE32(PNFS_BLOCK_READWRITE_DATA);
+		list_del(&lce->bse_node);
+		list_add_tail(&lce->bse_node, ranges); /* freed later by cleanup_layoutcommit */
+		bl->bl_count--;
+		count++;
+	}
+	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); /* backfill byte length (excl. this word) */
+	xdr_start[1] = cpu_to_be32(count);
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	dprintk("%s found %i ranges\n", __func__, count);
+	return 0;
+}
+
+/* Helper function to set_to_rw that initialize a new extent */
+static void
+_prep_new_extent(struct pnfs_block_extent *new,
+ struct pnfs_block_extent *orig,
+ sector_t offset, sector_t length, int state)
+{
+ kref_init(&new->be_refcnt);
+ /* don't need to INIT_LIST_HEAD(&new->be_node) */
+ memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
+ new->be_mdev = orig->be_mdev;
+ new->be_f_offset = offset;
+ new->be_length = length;
+ new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
+ new->be_state = state;
+ new->be_inval = orig->be_inval;
+}
+
+/* Tries to merge be with extent in front of it in list.
+ * Frees storage if not used.
+ */
+static struct pnfs_block_extent *
+_front_merge(struct pnfs_block_extent *be, struct list_head *head,
+ struct pnfs_block_extent *storage)
+{
+ struct pnfs_block_extent *prev;
+
+ if (!storage)
+ goto no_merge;
+ if (&be->be_node == head || be->be_node.prev == head)
+ goto no_merge;
+ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
+ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
+ !extents_consistent(prev, be))
+ goto no_merge;
+ _prep_new_extent(storage, prev, prev->be_f_offset,
+ prev->be_length + be->be_length, prev->be_state);
+ list_replace(&prev->be_node, &storage->be_node);
+ put_extent(prev);
+ list_del(&be->be_node);
+ put_extent(be);
+ return storage;
+
+ no_merge:
+ kfree(storage);
+ return be;
+}
+
+static u64
+set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
+{
+ u64 rv = offset + length;
+ struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
+ struct pnfs_block_extent *children[3];
+ struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
+ int i = 0, j;
+
+ dprintk("%s(%llu, %llu)\n", __func__, offset, length);
+ /* Create storage for up to three new extents e1, e2, e3 */
+ e1 = kmalloc(sizeof(*e1), GFP_KERNEL);
+ e2 = kmalloc(sizeof(*e2), GFP_KERNEL);
+ e3 = kmalloc(sizeof(*e3), GFP_KERNEL);
+ /* BUG - we are ignoring any failure */
+ if (!e1 || !e2 || !e3)
+ goto out_nosplit;
+
+ spin_lock(&bl->bl_ext_lock);
+ be = find_get_extent_locked(bl, offset);
+ rv = be->be_f_offset + be->be_length;
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
+ spin_unlock(&bl->bl_ext_lock);
+ goto out_nosplit;
+ }
+ /* Add e* to children, bumping e*'s krefs */
+ if (be->be_f_offset != offset) {
+ _prep_new_extent(e1, be, be->be_f_offset,
+ offset - be->be_f_offset,
+ PNFS_BLOCK_INVALID_DATA);
+ children[i++] = e1;
+ print_bl_extent(e1);
+ } else
+ merge1 = e1;
+ _prep_new_extent(e2, be, offset,
+ min(length, be->be_f_offset + be->be_length - offset),
+ PNFS_BLOCK_READWRITE_DATA);
+ children[i++] = e2;
+ print_bl_extent(e2);
+ if (offset + length < be->be_f_offset + be->be_length) {
+ _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
+ be->be_f_offset + be->be_length -
+ offset - length,
+ PNFS_BLOCK_INVALID_DATA);
+ children[i++] = e3;
+ print_bl_extent(e3);
+ } else
+ merge2 = e3;
+
+ /* Remove be from list, and insert the e* */
+ /* We don't get refs on e*, since this list is the base reference
+ * set when init'ed.
+ */
+ if (i < 3)
+ children[i] = NULL;
+ new = children[0];
+ list_replace(&be->be_node, &new->be_node);
+ put_extent(be);
+ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
+ for (j = 1; j < i; j++) {
+ old = new;
+ new = children[j];
+ list_add(&new->be_node, &old->be_node);
+ }
+ if (merge2) {
+ /* This is a HACK, should just create a _back_merge function */
+ new = list_entry(new->be_node.next,
+ struct pnfs_block_extent, be_node);
+ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
+ }
+ spin_unlock(&bl->bl_ext_lock);
+
+ /* Since we removed the base reference above, be is now scheduled for
+ * destruction.
+ */
+ put_extent(be);
+ dprintk("%s returns %llu after split\n", __func__, rv);
+ return rv;
+
+ out_nosplit:
+ kfree(e1);
+ kfree(e2);
+ kfree(e3);
+ dprintk("%s returns %llu without splitting\n", __func__, rv);
+ return rv;
+}
+
+void
+clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+ const struct nfs4_layoutcommit_args *arg,
+ int status)
+{
+ struct bl_layoutupdate_data *bld = arg->layoutdriver_data;
+ struct pnfs_block_short_extent *lce, *save;
+
+ dprintk("%s status %d\n", __func__, status);
+ list_for_each_entry_safe_reverse(lce, save, &bld->ranges, bse_node) {
+ if (likely(!status)) {
+ u64 offset = lce->bse_f_offset;
+ u64 end = offset + lce->bse_length;
+
+ do {
+ offset = set_to_rw(bl, offset, end - offset);
+ } while (offset < end);
+
+ kfree(lce);
+ } else {
+ spin_lock(&bl->bl_ext_lock);
+ add_to_commitlist(bl, lce);
+ spin_unlock(&bl->bl_ext_lock);
+ }
+ }
+}
diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.38.noarch/fs/nfs/blocklayout/Makefile
--- linux-2.6.38.noarch/fs/nfs/blocklayout/Makefile.orig 2011-03-26 07:57:44.235821643 -0400
+++ linux-2.6.38.noarch/fs/nfs/blocklayout/Makefile 2011-03-26 07:57:44.235821643 -0400
@@ -0,0 +1,6 @@
+#
+# Makefile for the pNFS block layout driver kernel module
+#
+obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
+blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \
+ extents.o block-device-discovery-pipe.o
diff -up linux-2.6.38.noarch/fs/nfs/callback.h.orig linux-2.6.38.noarch/fs/nfs/callback.h
--- linux-2.6.38.noarch/fs/nfs/callback.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/callback.h 2011-03-26 07:57:44.241821592 -0400
@@ -167,6 +167,26 @@ extern unsigned nfs4_callback_layoutreca
extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
extern void nfs4_cb_take_slot(struct nfs_client *clp);
+
+struct cb_devicenotifyitem {
+ uint32_t cbd_notify_type;
+ uint32_t cbd_layout_type;
+ struct nfs4_deviceid cbd_dev_id;
+ uint32_t cbd_immediate;
+};
+
+/* XXX: Should be dynamic up to max compound size */
+#define NFS4_DEV_NOTIFY_MAXENTRIES 10
+struct cb_devicenotifyargs {
+ struct sockaddr *addr;
+ int ndevs;
+ struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES];
+};
+
+extern __be32 nfs4_callback_devicenotify(
+ struct cb_devicenotifyargs *args,
+ void *dummy, struct cb_process_state *cps);
+
#endif /* CONFIG_NFS_V4_1 */
extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
diff -up linux-2.6.38.noarch/fs/nfs/callback_proc.c.orig linux-2.6.38.noarch/fs/nfs/callback_proc.c
--- linux-2.6.38.noarch/fs/nfs/callback_proc.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/callback_proc.c 2011-03-26 07:57:44.241821592 -0400
@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct
spin_lock(&ino->i_lock);
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
mark_matching_lsegs_invalid(lo, &free_me_list,
- args->cbl_range.iomode))
+ &args->cbl_range))
rv = NFS4ERR_DELAY;
else
rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -184,14 +184,14 @@ static u32 initiate_bulk_draining(struct
ino = lo->plh_inode;
spin_lock(&ino->i_lock);
set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
- if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
+ if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
rv = NFS4ERR_DELAY;
list_del_init(&lo->plh_bulk_recall);
spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&free_me_list);
put_layout_hdr(lo);
iput(ino);
}
- pnfs_free_lseg_list(&free_me_list);
return rv;
}
@@ -241,6 +241,36 @@ static void pnfs_recall_all_layouts(stru
do_callback_layoutrecall(clp, &args);
}
+__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
+ void *dummy, struct cb_process_state *cps)
+{
+ int i;
+ u32 type, res = 0;
+
+ dprintk("%s: -->\n", __func__);
+
+ if (!cps->clp) {
+ res = NFS4ERR_OP_NOT_IN_SESSION;
+ goto out;
+ }
+
+ for (i = 0; i < args->ndevs; i++) {
+ struct cb_devicenotifyitem *dev = &args->devs[i];
+ type = dev->cbd_notify_type;
+ if (type == NOTIFY_DEVICEID4_DELETE && cps->clp->cl_devid_cache)
+ pnfs_delete_deviceid(cps->clp->cl_devid_cache,
+ &dev->cbd_dev_id);
+ else if (type == NOTIFY_DEVICEID4_CHANGE)
+ printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE "
+ "not supported\n", __func__);
+ }
+
+out:
+ dprintk("%s: exit with status = %u\n",
+ __func__, res);
+ return cpu_to_be32(res);
+}
+
int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
{
if (delegation == NULL)
diff -up linux-2.6.38.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.38.noarch/fs/nfs/callback_xdr.c
--- linux-2.6.38.noarch/fs/nfs/callback_xdr.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/callback_xdr.c 2011-03-26 07:57:44.242821583 -0400
@@ -25,6 +25,7 @@
#if defined(CONFIG_NFS_V4_1)
#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
4 + 1 + 3)
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -284,6 +285,93 @@ out:
return status;
}
+static
+__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct cb_devicenotifyargs *args)
+{
+	__be32 *p;
+	__be32 status = 0;
+	u32 tmp;
+	int n, i;
+	args->ndevs = 0;
+
+	args->addr = svc_addr(rqstp);
+
+	/* Num of device notifications */
+	p = read_buf(xdr, sizeof(uint32_t));
+	if (unlikely(p == NULL)) {
+		status = htonl(NFS4ERR_RESOURCE);
+		goto out;
+	}
+	n = ntohl(*p++);
+	if (n <= 0)
+		goto out;
+
+	/* XXX: need to possibly return error in this case */
+	if (n > NFS4_DEV_NOTIFY_MAXENTRIES) {
+		dprintk("%s: Processing (%d) notifications out of (%d)\n",
+			__func__, NFS4_DEV_NOTIFY_MAXENTRIES, n);
+		n = NFS4_DEV_NOTIFY_MAXENTRIES;
+	}
+
+	/* Decode each dev notification */
+	for (i = 0; i < n; i++) {
+		struct cb_devicenotifyitem *dev = &args->devs[i];
+
+		p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+		if (unlikely(p == NULL)) {
+			status = htonl(NFS4ERR_RESOURCE);
+			goto out;
+		}
+
+		tmp = ntohl(*p++);	/* bitmap size */
+		if (tmp != 1) {
+			status = htonl(NFS4ERR_INVAL);
+			goto out;
+		}
+		dev->cbd_notify_type = ntohl(*p++);
+		if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
+		    dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
+			status = htonl(NFS4ERR_INVAL);
+			goto out;
+		}
+
+		tmp = ntohl(*p++);	/* opaque size */
+		if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
+		     (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
+		    ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
+		     (tmp != NFS4_DEVICEID4_SIZE + 4))) {
+			status = htonl(NFS4ERR_INVAL);
+			goto out;
+		}
+		dev->cbd_layout_type = ntohl(*p++);
+		memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
+		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+		/* the immediate flag is only present for CHANGE notifications,
+		 * keyed on the notify type (not the layout type) */
+		if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) {
+			p = read_buf(xdr, sizeof(uint32_t));
+			if (unlikely(p == NULL)) {
+				status = htonl(NFS4ERR_DELAY);
+				goto out;
+			}
+			dev->cbd_immediate = ntohl(*p++);
+		} else {
+			dev->cbd_immediate = 0;
+		}
+
+		args->ndevs++;
+
+		dprintk("%s: type %d layout 0x%x immediate %d\n",
+			__func__, dev->cbd_notify_type, dev->cbd_layout_type,
+			dev->cbd_immediate);
+	}
+out:
+	dprintk("%s: status %d ndevs %d\n",
+		__func__, ntohl(status), args->ndevs);
+	return status;
+}
+
static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid)
{
@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned in
case OP_CB_RECALL_ANY:
case OP_CB_RECALL_SLOT:
case OP_CB_LAYOUTRECALL:
+ case OP_CB_NOTIFY_DEVICEID:
*op = &callback_ops[op_nr];
break;
- case OP_CB_NOTIFY_DEVICEID:
case OP_CB_NOTIFY:
case OP_CB_PUSH_DELEG:
case OP_CB_RECALLABLE_OBJ_AVAIL:
@@ -849,6 +937,12 @@ static struct callback_op callback_ops[]
(callback_decode_arg_t)decode_layoutrecall_args,
.res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
},
+ [OP_CB_NOTIFY_DEVICEID] = {
+ .process_op = (callback_process_op_t)nfs4_callback_devicenotify,
+ .decode_args =
+ (callback_decode_arg_t)decode_devicenotify_args,
+ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
+ },
[OP_CB_SEQUENCE] = {
.process_op = (callback_process_op_t)nfs4_callback_sequence,
.decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff -up linux-2.6.38.noarch/fs/nfs/client.c.orig linux-2.6.38.noarch/fs/nfs/client.c
--- linux-2.6.38.noarch/fs/nfs/client.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/client.c 2011-03-26 07:57:44.244821565 -0400
@@ -404,7 +404,7 @@ static int nfs_sockaddr_match_ipaddr(con
* Test if two socket addresses represent the same actual socket,
* by comparing (only) relevant fields, including the port number.
*/
-static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
+int nfs_sockaddr_cmp(const struct sockaddr *sa1,
const struct sockaddr *sa2)
{
if (sa1->sa_family != sa2->sa_family)
@@ -418,6 +418,7 @@ static int nfs_sockaddr_cmp(const struct
}
return 0;
}
+EXPORT_SYMBOL(nfs_sockaddr_cmp);
/* Common match routine for v4.0 and v4.1 callback services */
bool
@@ -567,6 +568,7 @@ int nfs4_check_client_ready(struct nfs_c
return -EPROTONOSUPPORT;
return 0;
}
+EXPORT_SYMBOL(nfs4_check_client_ready);
/*
* Initialise the timeout values for a connection
@@ -889,7 +891,7 @@ error:
/*
* Load up the server record from information gained in an fsinfo record
*/
-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo)
+static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo)
{
unsigned long max_rpc_payload;
@@ -919,7 +921,9 @@ static void nfs_server_set_fsinfo(struct
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
server->wsize = NFS_MAX_FILE_IO_SIZE;
server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- set_pnfs_layoutdriver(server, fsinfo->layouttype);
+ server->pnfs_blksize = fsinfo->blksize;
+ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
+ pnfs_set_ds_iosize(server);
server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
@@ -965,7 +969,7 @@ static int nfs_probe_fsinfo(struct nfs_s
if (error < 0)
goto out_error;
- nfs_server_set_fsinfo(server, &fsinfo);
+ nfs_server_set_fsinfo(server, mntfh, &fsinfo);
/* Get some general file system info */
if (server->namelen == 0) {
@@ -1355,7 +1359,7 @@ error:
/*
* Set up an NFS4 client
*/
-static int nfs4_set_client(struct nfs_server *server,
+int nfs4_set_client(struct nfs_server *server,
const char *hostname,
const struct sockaddr *addr,
const size_t addrlen,
@@ -1398,6 +1402,7 @@ error:
dprintk("<-- nfs4_set_client() = xerror %d\n", error);
return error;
}
+EXPORT_SYMBOL(nfs4_set_client);
/*
diff -up linux-2.6.38.noarch/fs/nfsd/bl_com.c.orig linux-2.6.38.noarch/fs/nfsd/bl_com.c
--- linux-2.6.38.noarch/fs/nfsd/bl_com.c.orig 2011-03-26 07:57:44.279821268 -0400
+++ linux-2.6.38.noarch/fs/nfsd/bl_com.c 2011-03-26 07:57:44.279821268 -0400
@@ -0,0 +1,292 @@
+#if defined(CONFIG_SPNFS_BLOCK)
+
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/sched.h>
+#include <linux/exportfs.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/workqueue.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/proc_fs.h>
+#include <linux/nfs_fs.h>
+
+#include <linux/nfsd/debug.h>
+#include <linux/nfsd4_block.h>
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
+ char __user *, size_t);
+static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
+
+static struct rpc_pipe_ops bl_upcall_ops = {
+ .upcall = bl_pipe_upcall,
+ .downcall = bl_pipe_downcall,
+ .destroy_msg = bl_pipe_destroy_msg,
+};
+
+bl_comm_t *bl_comm_global;
+
+int
+nfsd_bl_start(void)
+{
+ bl_comm_t *bl_comm = NULL;
+ struct path path;
+ struct nameidata nd;
+ int rc;
+
+ dprintk("%s: starting pipe\n", __func__);
+ if (bl_comm_global)
+ return -EEXIST;
+
+ path.mnt = rpc_get_mount();
+ if (IS_ERR(path.mnt))
+ return PTR_ERR(path.mnt);
+
+ /* FIXME: do not abuse rpc_pipefs/nfs */
+ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
+ if (rc)
+ goto err;
+
+ bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL);
+ if (!bl_comm) {
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ /* FIXME: rename to "spnfs_block" */
+ bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm,
+ &bl_upcall_ops, 0);
+ if (IS_ERR(bl_comm->pipe_dentry)) {
+ rc = -EPIPE;
+ goto err;
+ }
+ mutex_init(&bl_comm->lock);
+ mutex_init(&bl_comm->pipe_lock);
+ init_waitqueue_head(&bl_comm->pipe_wq);
+
+ bl_comm_global = bl_comm;
+ return 0;
+err:
+ rpc_put_mount();
+ kfree(bl_comm);
+ return rc;
+}
+
+void
+nfsd_bl_stop(void)
+{
+ bl_comm_t *c = bl_comm_global;
+
+ dprintk("%s: stopping pipe\n", __func__);
+ if (!c)
+ return;
+ rpc_unlink(c->pipe_dentry);
+ rpc_put_mount();
+ bl_comm_global = NULL;
+ kfree(c);
+}
+
+static ssize_t
+bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst,
+ size_t buflen)
+{
+ char *data = (char *)msg->data + msg->copied;
+ ssize_t mlen = msg->len - msg->copied,
+ left;
+
+ if (mlen > buflen)
+ mlen = buflen;
+
+ left = copy_to_user(dst, data, mlen);
+ if (left < 0) {
+ msg->errno = left;
+ return left;
+ }
+ mlen -= left;
+ msg->copied += mlen;
+ msg->errno = 0;
+
+ return mlen;
+}
+
+static ssize_t
+bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
+ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
+ bl_comm_t *bc = (bl_comm_t *)rpci->private;
+ bl_comm_msg_t *im = &bc->msg;
+ int ret;
+ bl_comm_res_t *res;
+
+
+ if (mlen == 0) {
+ im->msg_status = PNFS_BLOCK_FAILURE;
+ im->msg_res = NULL;
+ wake_up(&bc->pipe_wq);
+ return -EFAULT;
+ }
+
+ if ((res = kmalloc(mlen, GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(res, src, mlen)) {
+ kfree(res);
+ return -EFAULT;
+ }
+
+ mutex_lock(&bc->pipe_lock);
+
+ ret = mlen;
+ im->msg_status = res->res_status;
+ im->msg_res = res;
+
+ wake_up(&bc->pipe_wq);
+ mutex_unlock(&bc->pipe_lock);
+ return ret;
+}
+
+static void
+bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+ bl_comm_msg_t *im = msg->data;
+ bl_comm_t *bc = container_of(im, struct bl_comm, msg);
+
+ if (msg->errno >= 0)
+ return;
+
+ mutex_lock(&bc->pipe_lock);
+ im->msg_status = PNFS_BLOCK_FAILURE;
+ wake_up(&bc->pipe_wq);
+ mutex_unlock(&bc->pipe_lock);
+}
+
+int
+bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res)
+{
+ struct rpc_pipe_msg msg;
+ DECLARE_WAITQUEUE(wq, current);
+ int rval = 1;
+ bl_comm_msg_t *m = &bc->msg;
+
+ if (bc == NULL) {
+ dprintk("%s: No pNFS block daemon available\n", __func__);
+ return 1;
+ }
+
+ mutex_lock(&bc->lock);
+ mutex_lock(&bc->pipe_lock);
+
+ memcpy(m, upmsg, sizeof (*m));
+
+ memset(&msg, 0, sizeof (msg));
+ msg.data = m;
+ msg.len = sizeof (*m);
+
+ add_wait_queue(&bc->pipe_wq, &wq);
+ rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg);
+ if (rval < 0) {
+ remove_wait_queue(&bc->pipe_wq, &wq);
+ goto out;
+ }
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&bc->pipe_lock);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&bc->pipe_wq, &wq);
+ mutex_lock(&bc->pipe_lock);
+
+ if (m->msg_status == PNFS_BLOCK_SUCCESS) {
+ *res = m->msg_res;
+ rval = 0;
+ } else
+ rval = 1;
+
+out:
+ mutex_unlock(&bc->pipe_lock);
+ mutex_unlock(&bc->lock);
+ return rval;
+}
+
+static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len,
+			 loff_t *offset)
+{
+	int		cmd,
+			rc;
+	bl_comm_t	*bc = bl_comm_global;
+	bl_comm_msg_t	msg;
+	bl_comm_res_t	*res = NULL; /* NULL so kfree() is safe if upcall fails */
+
+	if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int)))
+		return -EFAULT;
+	switch (cmd) {
+	case PNFS_BLOCK_CTL_STOP:
+		msg.msg_type = PNFS_UPCALL_MSG_STOP;
+		(void) bl_upcall(bc, &msg, &res);
+		kfree(res);
+		nfsd_bl_stop();
+		break;
+
+	case PNFS_BLOCK_CTL_START:
+		rc = nfsd_bl_start();
+		if (rc != 0)
+			return rc;
+		break;
+
+	case PNFS_BLOCK_CTL_VERS:
+		msg.msg_type = PNFS_UPCALL_MSG_VERS;
+		msg.u.msg_vers = PNFS_UPCALL_VERS;
+		if (bl_upcall(bc, &msg, &res)) {
+			dprintk("%s: Failed to contact pNFS block daemon\n",
+				__func__);
+			return 0;
+		}
+		kfree(res);
+		break;
+
+	default:
+		dprintk("%s: unknown ctl command %d\n", __func__, cmd);
+		break;
+	}
+	return len;
+}
+
+static struct file_operations ctl_ops = {
+ .write = ctl_write,
+};
+
+/*
+ * bl_init_proc -- set up proc interfaces
+ *
+ * Creating a pnfs_block directory isn't really required at this point
+ * since we've only got a single node in that directory. If the need for
+ * more nodes doesn't present itself shortly this code should revert
+ * to a single top level node. McNeal 11-Aug-2008.
+ */
+int
+bl_init_proc(void)
+{
+ struct proc_dir_entry *e;
+
+ e = proc_mkdir("fs/pnfs_block", NULL);
+ if (!e)
+ return -ENOMEM;
+
+ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL);
+ if (!e)
+ return -ENOMEM;
+ e->proc_fops = &ctl_ops;
+
+ return 0;
+}
+#endif /* CONFIG_SPNFS_BLOCK */
diff -up linux-2.6.38.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.38.noarch/fs/nfsd/bl_ops.c
--- linux-2.6.38.noarch/fs/nfsd/bl_ops.c.orig 2011-03-26 07:57:44.281821252 -0400
+++ linux-2.6.38.noarch/fs/nfsd/bl_ops.c 2011-03-26 07:57:44.281821252 -0400
@@ -0,0 +1,1672 @@
+/*
+ * bl_ops.c
+ * spNFS
+ *
+ * Created by Rick McNeal on 4/1/08.
+ * Copyright 2008 __MyCompanyName__. All rights reserved.
+ *
+ */
+
+/*
+ * Block layout operations.
+ *
+ * These functions, with the exception of pnfs_block_enabled, are assigned to
+ * the super block s_export_op structure.
+ */
+#if defined(CONFIG_SPNFS_BLOCK)
+
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/fs.h>
+#include <linux/exportfs.h>
+#include <linux/nfsd4_spnfs.h>
+#include <linux/nfsd/nfs4layoutxdr.h>
+#include <linux/nfsd/export.h>
+#include <linux/nfsd/nfsd4_pnfs.h>
+#include <linux/nfsd/debug.h>
+#include <linux/spinlock_types.h>
+#include <linux/dm-ioctl.h>
+#include <asm/uaccess.h>
+#include <linux/falloc.h>
+#include <linux/nfsd4_block.h>
+
+#include "pnfsd.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define BL_LAYOUT_HASH_BITS 4
+#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS)
+#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1)
+#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256)
+
+#define bl_layout_hashval(id) \
+ ((id) & BL_LAYOUT_HASH_MASK)
+
+#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len)
+#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len)
+#define _2SECTS(v) ((v) >> 9)
+
+#ifndef READ32
+#define READ32(x) (x) = ntohl(*p++)
+#define READ64(x) do { \
+(x) = (u64)ntohl(*p++) << 32; \
+(x) |= ntohl(*p++); \
+} while (0)
+#endif
+
+
+typedef enum {True, False} boolean_t;
+/* ---- block layoutget and commit structure ---- */
+typedef struct bl_layout_rec {
+ struct list_head blr_hash,
+ blr_layouts;
+ dev_t blr_rdev;
+ struct inode *blr_inode;
+ int blr_recalled; // debug
+ u64 blr_orig_size,
+ blr_commit_size,
+ blr_ext_size;
+ spinlock_t blr_lock; // Protects blr_layouts
+} bl_layout_rec_t;
+
+static struct list_head layout_hash;
+static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE];
+static spinlock_t layout_hashtbl_lock;
+
+/* ---- prototypes ---- */
+static boolean_t device_slice(dev_t devid);
+static boolean_t device_dm(dev_t devid);
+static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **);
+static bl_layout_rec_t *layout_inode_find(struct inode *i);
+static void layout_inode_del(struct inode *i);
+static char *map_state2name(enum pnfs_block_extent_state4 s);
+static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type);
+static void bld_free(pnfs_blocklayout_devinfo_t *bld);
+static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head *volumes,
+ dev_t devid, int local_index);
+static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes,
+ dev_t devid, int my_loc, int idx);
+static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h,
+ struct nfsd4_layout_seg *seg);
+struct list_head *layout_cache_iter(bl_layout_rec_t *r,
+ struct list_head *bl_possible, struct nfsd4_layout_seg *seg);
+static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h);
+static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h);
+static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg);
+static void print_bll(pnfs_blocklayout_layout_t *b, char *);
+static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r,
+ struct list_head *h, struct nfsd4_layout_seg *seg);
+static inline void bll_collapse(bl_layout_rec_t *r,
+ pnfs_blocklayout_layout_t *c);
+static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len,
+ enum bl_cache_state state, struct list_head *h);
+static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b,
+ enum bl_cache_state c, struct list_head *h);
+static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode,
+ enum pnfs_block_extent_state4 *s);
+static void extents_setup(struct fiemap_extent_info *fei);
+static void extents_count(struct fiemap_extent_info *fei, struct inode *i,
+ u64 foff, u64 len);
+static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i,
+ u64 foff, u64 len);
+static boolean_t extents_process(struct fiemap_extent_info *fei,
+ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev,
+ pnfs_blocklayout_layout_t *b);
+static void extents_cleanup(struct fiemap_extent_info *fei);
+
+void
+nfsd_bl_init(void)
+{
+ int i;
+ dprintk("%s loaded\n", __func__);
+
+ spin_lock_init(&layout_hashtbl_lock);
+ INIT_LIST_HEAD(&layout_hash);
+ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&layout_hashtbl[i]);
+ bl_init_proc();
+}
+
+/*
+ * pnfs_block_enabled -- check to see if this file system should be export as
+ * block pnfs
+ */
+int
+pnfs_block_enabled(struct inode *inode, int ex_flags)
+{
+ bl_comm_msg_t msg;
+ bl_comm_res_t *res = NULL;
+ static int bl_comm_once = 0;
+
+ dprintk("--> %s\n", __func__);
+ /*
+ * FIXME: Figure out method to determine if this file system should
+ * be exported. The following areas need to be checked.
+ * (1) Validate that this file system was exported as a pNFS
+ * block-layout
+ * (2) Has there been successful communication with the
+ * volume daemon?
+ */
+ /* Check #1 */
+#ifdef notyet
+ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) {
+ dprintk("%s: pnfs_block not set in export\n", __func__);
+ return 0;
+ }
+#endif
+
+	/* Check #2 */
+ if (!bl_comm_once) {
+ msg.msg_type = PNFS_UPCALL_MSG_VERS;
+ msg.u.msg_vers = PNFS_UPCALL_VERS;
+ if (bl_upcall(bl_comm_global, &msg, &res)) {
+ dprintk("%s: Failed to contact pNFS block daemon\n",
+ __func__);
+ return 0;
+ }
+ if (msg.u.msg_vers != res->u.vers) {
+ dprintk("%s: vers mismatch, kernel != daemon\n",
+ __func__);
+ kfree(res);
+ return 0;
+ }
+ }
+ bl_comm_once = 1;
+
+ kfree(res);
+
+ dprintk("<-- %s okay\n", __func__);
+ return 1;
+}
+
+int
+bl_layout_type(struct super_block *sb)
+{
+ return LAYOUT_BLOCK_VOLUME;
+}
+
+int
+bl_getdeviceiter(struct super_block *sb,
+ u32 layout_type,
+ struct nfsd4_pnfs_dev_iter_res *res)
+{
+ res->gd_eof = 1;
+ if (res->gd_cookie)
+ return -ENOENT;
+ res->gd_devid = sb->s_dev;
+ res->gd_verf = 1;
+ res->gd_cookie = 1;
+ return 0;
+}
+
+static int
+bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr,
+ const struct nfsd4_pnfs_deviceid *devid)
+{
+ pnfs_blocklayout_devinfo_t *bld_slice_p,
+ *bld_simple_p,
+ *bld;
+ int status = -EIO,
+ location = 0;
+ struct list_head volumes;
+
+ dprintk("--> %s\n", __func__);
+ INIT_LIST_HEAD(&volumes);
+
+ bld_simple_p = bld_simple(&volumes, devid->devid,
+ location++);
+ if (!bld_simple_p)
+ goto out;
+ bld_slice_p = bld_slice(&volumes, devid->devid, location++,
+ bld_simple_p->bld_index_loc);
+
+ if (!bld_slice_p)
+ goto out;
+
+ status = blocklayout_encode_devinfo(xdr, &volumes);
+
+out:
+ while (!list_empty(&volumes)) {
+ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
+ bld_list);
+ if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE)
+ kfree(bld->u.simple.bld_sig);
+ bld_free(bld);
+ }
+
+ dprintk("<-- %s (rval %d)\n", __func__, status);
+ return status;
+}
+
+static int
+bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr,
+ const struct nfsd4_pnfs_deviceid *devid)
+{
+ pnfs_blocklayout_devinfo_t *bld = NULL;
+ int status = -EIO, // default to error
+ i,
+ location = 0;
+ struct list_head volumes;
+ bl_comm_msg_t msg;
+ bl_comm_res_t *res;
+
+ dprintk("--> %s\n", __func__);
+ INIT_LIST_HEAD(&volumes);
+
+ msg.msg_type = PNFS_UPCALL_MSG_DMGET;
+ msg.u.msg_dev = devid->devid;
+ if (bl_upcall(bl_comm_global, &msg, &res)) {
+ dprintk("%s: upcall for DMGET failed\n", __func__);
+ goto out;
+ }
+
+ /*
+ * Don't use bld_alloc() here. If used this will be the first volume
+ * type added to the list whereas the protocol requires it to be the
+ * last.
+ */
+ bld = kmalloc(sizeof (*bld), GFP_KERNEL);
+ if (!bld)
+ goto out;
+ memset(bld, 0, sizeof (*bld));
+ bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE;
+ bld->u.stripe.bld_stripes = res->u.stripe.num_stripes;
+ bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL;
+ dprintk("%s: stripes %d, chunk_size %Lu\n", __func__,
+ bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL);
+
+ bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes *
+ sizeof (int), GFP_KERNEL);
+ if (!bld->u.stripe.bld_stripe_indexs)
+ goto out;
+
+ for (i = 0; i < bld->u.stripe.bld_stripes; i++) {
+ dev_t dev;
+ pnfs_blocklayout_devinfo_t *bldp;
+
+ dev = MKDEV(res->u.stripe.devs[i].major,
+ res->u.stripe.devs[i].minor);
+ if (dev == 0)
+ goto out;
+
+ bldp = bld_simple(&volumes, dev, location++);
+ if (!bldp) {
+ dprintk("%s: bld_simple failed\n", __func__);
+ goto out;
+ }
+ bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc);
+
+ if (!bldp) {
+ dprintk("%s: bld_slice failed\n", __func__);
+ goto out;
+ }
+ bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc;
+
+ }
+ list_add_tail(&bld->bld_list, &volumes);
+ status = blocklayout_encode_devinfo(xdr, &volumes);
+
+out:
+ while (!list_empty(&volumes)) {
+ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
+ bld_list);
+ switch (bld->bld_type) {
+ case PNFS_BLOCK_VOLUME_SLICE:
+ case PNFS_BLOCK_VOLUME_CONCAT:
+ // No memory to release for these
+ break;
+ case PNFS_BLOCK_VOLUME_SIMPLE:
+ kfree(bld->u.simple.bld_sig);
+ break;
+ case PNFS_BLOCK_VOLUME_STRIPE:
+ kfree(bld->u.stripe.bld_stripe_indexs);
+ break;
+ }
+ bld_free(bld);
+ }
+ kfree(res);
+ dprintk("<-- %s (rval %d)\n", __func__, status);
+ return status;
+}
+
+/*
+ * bl_getdeviceinfo -- determine device tree for requested devid
+ */
+int
+bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
+ u32 layout_type,
+ const struct nfsd4_pnfs_deviceid *devid)
+{
+ if (device_slice(devid->devid) == True)
+ return bl_getdeviceinfo_slice(sb, xdr, devid);
+ else if (device_dm(devid->devid) == True)
+ return bl_getdeviceinfo_dm(sb, xdr, devid);
+ return -EINVAL;
+}
+
+/*
+ * bl_layoutget -- service a LAYOUTGET request for the block layout type.
+ *
+ * Trims and sector-aligns the requested segment, makes sure backing store
+ * exists for writable layouts, then builds the extent list from the
+ * per-inode layout cache and encodes it into 'xdr'.
+ *
+ * Returns NFS4_OK, NFS4ERR_BADLAYOUT, NFS4ERR_IO, or
+ * NFS4ERR_LAYOUTTRYLATER (client should retry).
+ */
+enum nfsstat4
+bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr,
+	const struct nfsd4_pnfs_layoutget_arg *arg,
+	struct nfsd4_pnfs_layoutget_res *res)
+{
+	pnfs_blocklayout_layout_t *b;
+	bl_layout_rec_t *r;
+	struct list_head bl_possible,
+			*bl_candidates = NULL;
+	boolean_t del_on_error = False;
+	int adj;
+	enum nfsstat4 nfserr = NFS4_OK;
+
+	dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n",
+	    __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset),
+	    _2SECTS(res->lg_seg.length), res->lg_seg.iomode);
+
+	if (res->lg_seg.length == 0) {
+		printk("%s: request length of 0, error condition\n", __func__);
+		return NFS4ERR_BADLAYOUT;
+	}
+
+	/*
+	 * Adjust the length as required per spec.
+	 * - First case is where the length is set to (u64)-1. Cheap means to
+	 *   define the end of the file.
+	 * - Second case is where the I/O mode is read-only, but the request is
+	 *   past the end of the file so the request needs to be trimmed.
+	 */
+	if ((res->lg_seg.length == NFS4_MAX_UINT64) ||
+	    (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) &&
+	    (res->lg_seg.iomode == IOMODE_READ)))
+		res->lg_seg.length = i->i_size - res->lg_seg.offset;
+
+	/* ---- Round the segment out to whole 512-byte sectors. ---- */
+	adj = res->lg_seg.offset & 511;
+	res->lg_seg.offset -= adj;
+	res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511;
+
+	if (res->lg_seg.iomode != IOMODE_READ) {
+		/*
+		 * Writable layouts need backing store allocated up front.
+		 * Check that the file system actually provides ->fallocate
+		 * before calling it; the method is otherwise only verified
+		 * later, in layout_inode_add().
+		 */
+		if (!i->i_fop->fallocate ||
+		    i->i_fop->fallocate(i, FALLOC_FL_KEEP_SIZE,
+			res->lg_seg.offset, res->lg_seg.length))
+			return NFS4ERR_IO;
+	}
+
+	INIT_LIST_HEAD(&bl_possible);
+
+	if ((r = layout_inode_find(i)) == NULL) {
+		if (layout_inode_add(i, &r) == False) {
+			printk("%s: layout_inode_add failed\n", __func__);
+			return NFS4ERR_IO;
+		}
+		/* ---- Newly created record: remove it again on failure. ---- */
+		del_on_error = True;
+	}
+	BUG_ON(!r);
+
+	spin_lock(&r->blr_lock);
+
+	if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) {
+		/*
+		 * This will send LAYOUTTRYAGAIN error to the client.
+		 */
+		dprintk("%s: layout_cache_fill_from() failed\n", __func__);
+		nfserr = NFS4ERR_LAYOUTTRYLATER;
+		goto layoutget_cleanup;
+	}
+
+	res->lg_return_on_close = 1;
+	res->lg_seg.length = 0;
+
+	bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg);
+	if (!bl_candidates) {
+		nfserr = NFS4ERR_LAYOUTTRYLATER;
+		goto layoutget_cleanup;
+	}
+
+	layout_cache_merge(r, bl_candidates);
+	if (layout_cache_update(r, bl_candidates)) {
+		/* ---- Failed to allocate memory. ---- */
+		dprintk("%s: layout_cache_update() failed\n", __func__);
+		nfserr = NFS4ERR_LAYOUTTRYLATER;
+		goto layoutget_cleanup;
+	}
+
+	nfserr = blocklayout_encode_layout(xdr, bl_candidates);
+	if (nfserr)
+		dprintk("%s: layoutget xdr routine failed\n", __func__);
+
+layoutget_cleanup:
+	if (bl_candidates) {
+		while (!list_empty(bl_candidates)) {
+			b = list_entry(bl_candidates->next,
+			    struct pnfs_blocklayout_layout, bll_list);
+			list_del(&b->bll_list);
+			kfree(b);
+		}
+		/*
+		 * The list head itself was kmalloc'ed by layout_cache_iter();
+		 * previously it was leaked here.
+		 */
+		kfree(bl_candidates);
+	}
+
+	spin_unlock(&r->blr_lock);
+	if (unlikely(nfserr)) {
+		if (del_on_error == True)
+			layout_inode_del(i);
+		res->lg_seg.length = 0;
+		res->lg_seg.offset = 0;
+	}
+
+	dprintk("<-- %s (rval %u)\n", __func__, nfserr);
+	return nfserr;
+}
+
+/*
+ * bl_layoutcommit -- commit changes, especially size, to file system
+ *
+ * Currently this routine isn't called and everything is handled within
+ * nfsd4_layoutcommit(). By not calling this routine the server doesn't
+ * handle a partial return, a set of extents, of the layout. The extents
+ * are decoded here, but nothing is done with them. If this routine is
+ * to be called the interface must change to pass the 'dentry' pointer such
+ * that notify_change() can be called.
+ */
+int
+bl_layoutcommit(struct inode *i,
+		const struct nfsd4_pnfs_layoutcommit_arg *args,
+		struct nfsd4_pnfs_layoutcommit_res *res)
+{
+	bl_layout_rec_t *r;
+	int status = 0;
+	u64 lw_plus;
+
+	dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
+	r = layout_inode_find(i);
+	if (r) {
+		/* lc_last_wr is the last byte written; +1 gives a size. */
+		lw_plus = args->lc_last_wr + 1;
+		if (args->lc_newoffset) {
+			dprintk("  lc_last_wr %Lu\n", lw_plus);
+			/* Grow the recorded size and tell the client. */
+			if (r->blr_orig_size < lw_plus) {
+				r->blr_orig_size = lw_plus;
+				res->lc_size_chg = 1;
+				res->lc_newsize = lw_plus;
+			}
+		}
+
+		if (args->lc_up_len) {
+			int extents,
+			    i;
+			struct pnfs_blocklayout_layout *b;
+			__be32 *p = args->lc_up_layout;
+
+			/*
+			 * Client is returning a set of extents which
+			 * should/could be used to update the file system.
+			 * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08
+			 */
+			/* READ32/READ64 presumably advance 'p' as they
+			 * decode — TODO confirm macro definitions. */
+			READ32(extents);
+			dprintk("  Client returning %d extents: data size %d\n",
+				extents, args->lc_up_len);
+			/*
+			 * NOTE(review): 'extents' comes straight from the
+			 * client and sizes this kmalloc() unvalidated —
+			 * consider bounding it against lc_up_len.
+			 */
+			b = kmalloc(sizeof (struct pnfs_blocklayout_layout) *
+			    extents, GFP_KERNEL);
+			if (b) {
+				/* Decode for debug only; data is discarded. */
+				for (i = 0; i < extents; i++) {
+					READ64(b[i].bll_vol_id.sbid);
+					READ64(b[i].bll_vol_id.devid);
+					READ64(b[i].bll_foff);
+					READ64(b[i].bll_len);
+					READ64(b[i].bll_soff);
+					READ32(b[i].bll_es);
+					dprintk("  %d: foff %Lu, len %Lu, soff %Lu "
+						"state %s\n",
+						i, _2SECTS(b[i].bll_foff),
+						_2SECTS(b[i].bll_len),
+						_2SECTS(b[i].bll_soff),
+						map_state2name(b[i].bll_es));
+				}
+				kfree(b);
+			} else {
+				status = -ENOMEM;
+			}
+		}
+	} else
+		dprintk("%s: Unexpected commit to inode %p\n", __func__, i);
+
+	dprintk("<-- %s (rval %d)\n", __func__, status);
+	return status;
+}
+
+/*
+ * bl_layoutreturn -- drop the returned segment from the layout cache and
+ * release the per-inode record when no layouts remain.
+ */
+int
+bl_layoutreturn(struct inode *i,
+		const struct nfsd4_pnfs_layoutreturn_arg *args)
+{
+	bl_layout_rec_t *rec;
+	int status = 0;
+
+	dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
+
+	rec = layout_inode_find(i);
+	if (rec != NULL) {
+		spin_lock(&rec->blr_lock);
+		layout_cache_del(rec, &args->lr_seg);
+		spin_unlock(&rec->blr_lock);
+		dprintk("  ext_size %Lu, i_size %Lu, orig_size %Lu\n",
+		    rec->blr_ext_size, i->i_size, rec->blr_orig_size);
+	}
+
+	/* Frees the record only if its layout list is now empty. */
+	layout_inode_del(i);
+	dprintk("<-- %s (rval %d)\n", __func__, status);
+	return status;
+}
+
+/*
+ * bl_layoutrecall -- issue a layout recall callback to the client for the
+ * given range.  Only RETURN_FILE is implemented; RETURN_FSID/RETURN_ALL
+ * are logged and ignored.  Note that even for a sub-file request the
+ * callback sent is a full-file recall (offset 0, length NFS4_MAX_UINT64).
+ */
+int
+bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
+{
+	struct super_block *sb;
+	struct nfsd4_pnfs_cb_layout lr;
+	bl_layout_rec_t *r;
+	pnfs_blocklayout_layout_t *b;
+	u64 adj;
+
+	dprintk("--> %s\n", __func__);
+	BUG_ON(!len);
+	switch (type) {
+	case RETURN_FILE:
+		sb = inode->i_sb;
+		dprintk("  recalling layout [0x%x:%lu], %Lu:%Lu\n",
+			inode->i_sb->s_dev, inode->i_ino,
+			_2SECTS(offset), _2SECTS(len));
+		break;
+	case RETURN_FSID:
+		sb = inode->i_sb;
+		dprintk("%s: recalling layout for fsid x (unimplemented)\n",
+			__func__);
+		return 0;
+	case RETURN_ALL:
+		/*
+		 * XXX figure out how to get a sb since there's no
+		 * inode ptr
+		 */
+		dprintk("%s: recalling all layouts (unimplemented)\n",
+			__func__);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+
+restart:
+	/* Skip entirely if a full recall is already outstanding. */
+	r = layout_inode_find(inode);
+	if (r && len && !r->blr_recalled) {
+		spin_lock(&r->blr_lock);
+		list_for_each_entry(b, &r->blr_layouts, bll_list) {
+			if (!r->blr_recalled && !b->bll_recalled &&
+			    (offset >= b->bll_foff) && (offset < BLL_F_END(b))) {
+				b->bll_recalled = 1;
+				lr.cbl_recall_type = type;
+				lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME;
+				lr.cbl_seg.clientid = 0;
+				lr.cbl_seg.offset = 0;
+				lr.cbl_seg.length = NFS4_MAX_UINT64;
+				r->blr_recalled = 1;
+				dprintk("  FULL LAYOUTRECALL\n");
+				lr.cbl_seg.iomode = IOMODE_ANY;
+
+				/*
+				 * Currently there are only two cases where the
+				 * layout is being returned.
+				 * (1) Someone is issuing a NFS_WRITE operation
+				 *     to this layout.
+				 * (2) The file has been truncated which means
+				 *     the layout is immediately made invalid.
+				 * In both cases the client must write any
+				 * uncommitted modifications to the server via
+				 * NFS_WRITE.
+				 */
+				lr.cbl_layoutchanged = 1;
+
+				/*
+				 * Need to drop the lock because we'll get a
+				 * layoutreturn which will block waiting for
+				 * the lock. The request will come in on the
+				 * same thread which will cause a deadlock.
+				 */
+				spin_unlock(&r->blr_lock);
+				nfsd_layout_recall_cb(sb, inode, &lr);
+				/*
+				 * NOTE(review): 'b' is dereferenced here
+				 * after the lock was dropped; a concurrent
+				 * layoutreturn may have freed it — TODO
+				 * confirm the entry's lifetime across the
+				 * callback.
+				 */
+				adj = MIN(b->bll_len - (offset - b->bll_foff),
+				    len);
+				offset += adj;
+				len -= adj;
+				if (!len) {
+					spin_lock(&r->blr_lock);
+					break;
+				}
+				/*
+				 * Since layoutreturn will have been called we
+				 * can't assume blr_layouts is still valid,
+				 * so restart.
+				 */
+				goto restart;
+			}
+		}
+		spin_unlock(&r->blr_lock);
+	}
+
+	dprintk("<-- %s\n", __func__);
+	return 0;
+}
+
+/*
+ * []------------------------------------------------------------------[]
+ * | Support functions from here on down. |
+ * []------------------------------------------------------------------[]
+ */
+
+/*
+ * bld_simple -- given a dev_t build a simple volume structure
+ *
+ * Simple volume contains the device signature and offset to that data in
+ * the storage volume.
+ */
+static pnfs_blocklayout_devinfo_t *
+bld_simple(struct list_head *volumes, dev_t devid, int local_index)
+{
+	pnfs_blocklayout_devinfo_t *bld = NULL;
+	bl_comm_msg_t msg;
+	bl_comm_res_t *res = NULL;
+
+	/*
+	 * NOTE(review): 'local_index' is accepted but never stored here;
+	 * bld_index_loc is left at its zeroed default — TODO confirm the
+	 * caller's expectations.
+	 */
+	msg.msg_type = PNFS_UPCALL_MSG_GETSIG;
+	msg.u.msg_dev = devid;
+	if (bl_upcall(bl_comm_global, &msg, &res)) {
+		dprintk("%s: Failed to get signature information\n", __func__);
+		goto error;
+	}
+
+	bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE);
+	if (!bld)
+		goto error;	/* was "return NULL", which leaked 'res' */
+
+	bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset;
+	bld->u.simple.bld_sig_len = res->u.sig.len;
+	bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL);
+	if (!bld->u.simple.bld_sig)
+		goto error;
+
+	memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len);
+	kfree(res);
+	return bld;
+
+error:
+	if (bld)
+		bld_free(bld);
+	kfree(res);	/* kfree(NULL) is a no-op; guard dropped */
+	dprintk("%s: error in bld_simple\n", __func__);
+	return NULL;
+}
+
+/*
+ * bld_slice -- given a dev_t build a slice volume structure
+ *
+ * A slice volume contains the length of the slice/partition and its offset
+ * from the beginning of the storage volume. There's also a reference to
+ * the "simple" volume which contains this slice.
+ */
+static pnfs_blocklayout_devinfo_t *
+bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc)
+{
+	pnfs_blocklayout_devinfo_t *bld;
+	bl_comm_msg_t msg;
+	bl_comm_res_t *res;
+
+	dprintk("--> %s\n", __func__);
+	bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE);
+	if (!bld)
+		return NULL;
+
+	/* Ask userspace for the slice's start/length (in sectors). */
+	msg.msg_type = PNFS_UPCALL_MSG_GETSLICE;
+	msg.u.msg_dev = devid;
+	if (bl_upcall(bl_comm_global, &msg, &res)) {
+		dprintk("Upcall to get slice info failed\n");
+		/* NOTE(review): if bl_upcall() can allocate 'res' even on
+		 * failure, it is leaked here — TODO confirm its contract. */
+		bld_free(bld);
+		return NULL;
+	}
+
+	bld->bld_devid.devid = devid;
+	bld->bld_index_loc = my_loc;
+	/* Convert sector counts from the upcall into byte values. */
+	bld->u.slice.bld_start = res->u.slice.start * 512LL;
+	bld->u.slice.bld_len = res->u.slice.length * 512LL;
+	/* Reference to the enclosing "simple" volume's index. */
+	bld->u.slice.bld_index = simple_loc;
+
+	dprintk("%s: start %Lu, len %Lu\n", __func__,
+	    bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL);
+
+	kfree(res);
+	dprintk("<-- %s (rval %p)\n", __func__, bld);
+	return bld;
+}
+
+/*
+ * layout_cache_fill_from -- build, on list 'h', the set of segments covering
+ * the requested range 'seg', reusing cached layout entries where possible.
+ *
+ * Returns 0 on success, -EIO when the cache walk reports a conflict,
+ * -ENOMEM when an entry cannot be allocated.
+ */
+static int
+layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h,
+    struct nfsd4_layout_seg *seg)
+{
+	pnfs_blocklayout_layout_t *n;
+
+	dprintk("--> %s\n", __func__);
+
+	if (!list_empty(&r->blr_layouts))
+		if (layout_cache_fill_from_list(r, h, seg) == False)
+			return -EIO;
+
+	/*
+	 * This deals with two conditions.
+	 * (1) When blr_layouts is empty we need to create the first entry
+	 * (2) When the range requested falls past the end of any current
+	 *     layout the residual must be taken care of.
+	 */
+	if (seg->length) {
+		n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h);
+		if (!n)
+			return -ENOMEM;
+		dprintk("  remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff),
+		    _2SECTS(n->bll_len));
+	}
+
+	dprintk("<-- %s\n", __func__);
+	return 0;
+}
+
+/*
+ * layout_cache_iter -- turn the "possible" segment list into concrete
+ * pNFS extents by querying the file system (fiemap) for BLOCK_LAYOUT_NEW
+ * entries and duplicating cached ones.
+ *
+ * Returns a kmalloc'ed list head of candidates (caller frees entries and
+ * the head), or NULL on failure with everything released.
+ */
+struct list_head *
+layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible,
+    struct nfsd4_layout_seg *seg)
+{
+	pnfs_blocklayout_layout_t *b,
+	    *n = NULL;
+	struct list_head *bl_candidates = NULL;
+	struct fiemap_extent_info fei;
+	struct inode *i;
+	dev_t dev;
+
+	dev = r->blr_rdev;
+	i = r->blr_inode;
+
+	dprintk("--> %s\n", __func__);
+	bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL);
+	if (!bl_candidates)
+		return NULL;
+	INIT_LIST_HEAD(bl_candidates);
+	extents_setup(&fei);
+
+	list_for_each_entry(b, bl_possible, bll_list) {
+		if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
+
+			extents_count(&fei, i, b->bll_foff, b->bll_len);
+			if (fei.fi_extents_mapped) {
+
+				/*
+				 * Common case here. Got a range which has
+				 * extents. Now get those extents and process
+				 * them into pNFS extents.
+				 */
+				if (extents_get(&fei, i, b->bll_foff,
+				    b->bll_len) == False)
+					goto cleanup;
+				if (extents_process(&fei, bl_candidates,
+				    seg, dev, b) == False)
+					goto cleanup;
+				extents_cleanup(&fei);
+
+			} else if (seg->iomode == IOMODE_READ) {
+
+				/*
+				 * Found a hole in a file while reading. No
+				 * problem, just create a pNFS extent for the
+				 * range and let the client know there's no
+				 * backing store.
+				 */
+				n = bll_alloc(b->bll_foff, b->bll_len,
+				    BLOCK_LAYOUT_NEW, bl_candidates);
+				if (!n)	/* was dereferenced unchecked */
+					goto cleanup;
+				n->bll_es = PNFS_BLOCK_NONE_DATA;
+				n->bll_vol_id.sbid = 0;
+				n->bll_vol_id.devid = dev;
+				seg->length += b->bll_len;
+			} else {
+
+				/*
+				 * There's a problem here. Since the iomode
+				 * is read/write fallocate should have allocated
+				 * any necessary storage for the given range.
+				 */
+				dprintk("  Extent count for RW is 0\n");
+				goto cleanup;
+			}
+
+		} else {
+			n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates);
+			if (!n)	/* was dereferenced unchecked */
+				goto cleanup;
+			seg->length += n->bll_len;
+		}
+
+		if (r->blr_ext_size < (b->bll_foff + b->bll_len))
+			r->blr_ext_size = b->bll_foff + b->bll_len;
+	}
+
+	while (!list_empty(bl_possible)) {
+		b = list_entry(bl_possible->next,
+		    struct pnfs_blocklayout_layout, bll_list);
+		list_del(&b->bll_list);
+		kfree(b);
+	}
+
+	b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout,
+	    bll_list);
+	seg->offset = b->bll_foff;
+	dprintk("<-- %s okay\n", __func__);
+	return bl_candidates;
+
+cleanup:
+	extents_cleanup(&fei);
+	/*
+	 * Free the candidate entries as well as the list head; the caller
+	 * only sees a NULL return and cannot release them itself.
+	 * (Previously only the list head was freed, leaking the entries.)
+	 */
+	while (!list_empty(bl_candidates)) {
+		b = list_entry(bl_candidates->next,
+		    struct pnfs_blocklayout_layout, bll_list);
+		list_del(&b->bll_list);
+		kfree(b);
+	}
+	kfree(bl_candidates);
+	dprintk("<-- %s, error occurred\n", __func__);
+	return NULL;
+}
+
+/*
+ * layout_cache_merge -- collapse layouts which make up a contiguous range.
+ */
+static void
+layout_cache_merge(bl_layout_rec_t *r, struct list_head *h)
+{
+	pnfs_blocklayout_layout_t *b,
+	    *p;
+
+	dprintk("--> %s\n", __func__);
+	/* The loop restarts from the top after any merge because the list
+	 * is modified (an entry is freed) inside the walk. */
+restart:
+	p = NULL;
+	list_for_each_entry(b, h, bll_list) {
+		/* Adjacent on *storage* and mergeable data states. */
+		if (p && (BLL_S_END(p) == b->bll_soff) &&
+		    (p->bll_es == b->bll_es) &&
+		    (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
+			/*
+			 * We've got a candidate.
+			 */
+#ifdef too_verbose
+			dprintk("  merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n",
+				_2SECTS(b->bll_foff), _2SECTS(b->bll_len),
+				_2SECTS(b->bll_soff),
+				_2SECTS(p->bll_foff), _2SECTS(p->bll_len),
+				_2SECTS(b->bll_soff));
+#endif
+
+			if (p->bll_cache_state == BLOCK_LAYOUT_CACHE)
+				p->bll_cache_state = BLOCK_LAYOUT_UPDATE;
+			p->bll_len += b->bll_len;
+			list_del(&b->bll_list);
+			kfree(b);
+			goto restart;
+		} else if (p && (BLL_F_END(p) == b->bll_foff) &&
+		    (p->bll_es == b->bll_es) &&
+		    (b->bll_es == PNFS_BLOCK_NONE_DATA)) {
+			/* Holes have no storage, so merge on adjacent
+			 * *file* offsets instead. */
+			p->bll_len += b->bll_len;
+			list_del(&b->bll_list);
+			kfree(b);
+			goto restart;
+		} else
+			p = b;
+	}
+	dprintk("<-- %s\n", __func__);
+}
+
+/*
+ * layout_cache_update -- fold the candidate extents back into the
+ * per-inode layout cache (caller holds r->blr_lock).
+ *
+ * Returns 0 on success or -ENOMEM when a cache entry cannot be allocated.
+ */
+static int
+layout_cache_update(bl_layout_rec_t *r, struct list_head *h)
+{
+	pnfs_blocklayout_layout_t *b,
+	    *c,
+	    *n;
+	int status = 0;	/* was boolean_t, but it carries -ENOMEM */
+
+	dprintk("--> %s\n", __func__);
+	if (list_empty(&r->blr_layouts)) {
+		/* ---- Just add entries and return ---- */
+		dprintk("  cache empty for inode 0x%x:%ld\n", r->blr_rdev,
+		    r->blr_inode->i_ino);
+		list_for_each_entry(b, h, bll_list) {
+			c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE,
+			    &r->blr_layouts);
+			if (!c) {
+				status = -ENOMEM;
+				break;
+			}
+			dprintk("  adding %Lu(f):%Lu(l):%Lu(s):%d\n",
+			    _2SECTS(c->bll_foff), _2SECTS(c->bll_len),
+			    _2SECTS(c->bll_soff), c->bll_es);
+		}
+		return status;
+	}
+
+	list_for_each_entry(b, h, bll_list) {
+		BUG_ON(!b->bll_vol_id.devid);
+		if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) {
+			boolean_t found = False;
+			list_for_each_entry(c, &r->blr_layouts, bll_list) {
+				if ((b->bll_soff >= c->bll_soff) &&
+				    (b->bll_soff < BLL_S_END(c)) &&
+				    (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
+					u64 u;
+
+					if ((b->bll_foff < c->bll_foff) ||
+					    (b->bll_foff > BLL_F_END(c)))
+						BUG();
+
+					u = BLL_S_END(b) - BLL_S_END(c);
+					/*
+					 * The updated cache entry has to be
+					 * different than the current.
+					 * Otherwise the cache state for 'b'
+					 * should be BLOCK_LAYOUT_CACHE.
+					 */
+					BUG_ON(BLL_S_END(b) < BLL_S_END(c));
+
+					dprintk("  "
+					    "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n",
+					    _2SECTS(c->bll_foff),
+					    _2SECTS(c->bll_len),
+					    _2SECTS(c->bll_soff),
+					    _2SECTS(c->bll_len + u));
+					c->bll_len += u;
+					bll_collapse(r, c);
+					found = True;
+					break;
+				}
+			}
+
+			if (found == False) {
+				dprintk("  ERROR Expected to find"
+				    " %Lu(f):%Lu(l):%Lu(s), but didn't\n",
+				    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
+				    _2SECTS(b->bll_soff));
+				list_for_each_entry(c, &r->blr_layouts, bll_list)
+					print_bll(c, "Cached");
+				BUG();
+			}
+		} else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
+
+			c = list_first_entry(&r->blr_layouts,
+			    struct pnfs_blocklayout_layout, bll_list);
+			if (b->bll_foff < c->bll_foff) {
+				/*
+				 * Special case where new entry is before
+				 * first cached entry.
+				 */
+				c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL);
+				if (!c)	/* was dereferenced unchecked */
+					return -ENOMEM;
+				list_add(&c->bll_list, &r->blr_layouts);
+				dprintk("  new entry at head of list at %Lu, "
+				    "len %Lu\n",
+				    _2SECTS(c->bll_foff), _2SECTS(c->bll_len));
+			} else {
+				list_for_each_entry(c, &r->blr_layouts,
+				    bll_list) {
+					n = list_entry(c->bll_list.next,
+					    struct pnfs_blocklayout_layout,
+					    bll_list);
+					/*
+					 * This is ugly, but can't think of
+					 * another way to examine this case.
+					 * Consider the following. Need to
+					 * add an entry which starts at 40
+					 * and the cache has the following
+					 * entries:
+					 *    Start Length
+					 *    10    5
+					 *    30    5
+					 *    50    5
+					 * So, need to look and see if the new
+					 * entry starts after the current
+					 * cache, but before the next one.
+					 * There's a catch in that the next
+					 * entry might not be valid as it's
+					 * really just a pointer to the list
+					 * head.
+					 */
+					if (((b->bll_foff >=
+					    BLL_F_END(c)) &&
+					    (c->bll_list.next == &r->blr_layouts)) ||
+					    ((b->bll_foff >=
+					    BLL_F_END(c)) &&
+					    (b->bll_foff < n->bll_foff))) {
+
+						n = bll_alloc_dup(b,
+						    BLOCK_LAYOUT_CACHE, NULL);
+						if (!n)	/* was unchecked */
+							return -ENOMEM;
+						dprintk("  adding new %Lu:%Lu"
+						    " after %Lu:%Lu\n",
+						    _2SECTS(n->bll_foff),
+						    _2SECTS(n->bll_len),
+						    _2SECTS(c->bll_foff),
+						    _2SECTS(c->bll_len));
+						list_add(&n->bll_list,
+						    &c->bll_list);
+						break;
+					}
+				}
+			}
+		}
+	}
+	dprintk("<-- %s\n", __func__);
+	return status;
+}
+
+/*
+ * layout_cache_del -- remove the returned segment from the cache,
+ * trimming, deleting, or splitting cache entries as needed.
+ * Caller holds r->blr_lock.
+ */
+static void
+layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in)
+{
+	struct pnfs_blocklayout_layout *b,
+	    *n;
+	u64 len;
+	struct nfsd4_layout_seg seg = *seg_in;
+
+	dprintk("--> %s\n", __func__);
+	/* Full-file return: drop every cached entry at once. */
+	if (seg.length == NFS4_MAX_UINT64) {
+		r->blr_recalled = 0;
+		dprintk("  Fast return of all layouts\n");
+		while (!list_empty(&r->blr_layouts)) {
+			b = list_entry(r->blr_layouts.next,
+			    struct pnfs_blocklayout_layout, bll_list);
+			dprintk("  foff %Lu, len %Lu, soff %Lu\n",
+			    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
+			    _2SECTS(b->bll_soff));
+			list_del(&b->bll_list);
+			kfree(b);
+		}
+		dprintk("<-- %s\n", __func__);
+		return;
+	}
+
+restart:
+	list_for_each_entry(b, &r->blr_layouts, bll_list) {
+		if (seg.offset == b->bll_foff) {
+			/*
+			 * This handle the following three cases:
+			 * (1) return layout matches entire cache layout
+			 * (2) return layout matches beginning portion of cache
+			 * (3) return layout matches entire cache layout and
+			 *     into next entry. Varies from #1 in end case.
+			 */
+			dprintk("  match on offsets, %Lu:%Lu\n",
+			    _2SECTS(seg.offset), _2SECTS(seg.length));
+			len = MIN(seg.length, b->bll_len);
+			b->bll_foff += len;
+			b->bll_soff += len;
+			b->bll_len -= len;
+			seg.length -= len;
+			seg.offset += len;
+			if (!b->bll_len) {
+				list_del(&b->bll_list);
+				kfree(b);
+				dprintk("  removing cache line\n");
+				if (!seg.length) {
+					dprintk("  also finished\n");
+					goto complete;
+				}
+				/*
+				 * Since 'b' was freed we can't continue at the
+				 * next entry which is referenced as
+				 * b->bll_list.next by the list_for_each_entry
+				 * macro. Need to restart the loop.
+				 * TODO: Think about creating a dummy 'b' which
+				 * would keep list_for_each_entry() happy.
+				 */
+				goto restart;
+			}
+			if (!seg.length) {
+				dprintk("  finished, but cache line not"
+				    "empty\n");
+				goto complete;
+			}
+		} else if ((seg.offset >= b->bll_foff) &&
+		    (seg.offset < BLL_F_END(b))) {
+			/*
+			 * layout being returned is within this cache line.
+			 */
+			dprintk("  layout %Lu:%Lu within cache line %Lu:%Lu\n",
+			    _2SECTS(seg.offset), _2SECTS(seg.length),
+			    _2SECTS(b->bll_foff), _2SECTS(b->bll_len));
+			BUG_ON(!seg.length);
+			if ((seg.offset + seg.length) >= BLL_F_END(b)) {
+				/*
+				 * Layout returned starts in the middle of
+				 * cache entry and just need to trim back
+				 * cache to shorter length.
+				 */
+				dprintk("  trim back cache line\n");
+				len = seg.offset - b->bll_foff;
+				seg.offset += b->bll_len - len;
+				seg.length -= b->bll_len - len;
+				b->bll_len = len;
+				if (!seg.length)
+					return;
+			} else {
+				/*
+				 * Need to split current cache layout because
+				 * chunk is being removed from the middle.
+				 */
+				dprintk("  split cache line\n");
+				len = seg.offset + seg.length;
+				/*
+				 * NOTE(review): bll_alloc() can return NULL
+				 * and 'n' is dereferenced below unchecked.
+				 * NOTE(review): 'len' here is an absolute
+				 * file offset, so "b->bll_soff + len" looks
+				 * like it should instead be
+				 * b->bll_soff + (len - b->bll_foff) — TODO
+				 * confirm the soff/foff mapping.
+				 */
+				n = bll_alloc(len,
+				    (b->bll_foff + b->bll_len) - len,
+				    BLOCK_LAYOUT_CACHE, NULL);
+				n->bll_soff = b->bll_soff + len;
+				list_add(&n->bll_list, &b->bll_list);
+				b->bll_len = seg.offset - b->bll_foff;
+				return;
+			}
+		}
+	}
+complete:
+	/* All layouts gone: clear the recall-in-progress marker. */
+	if (list_empty(&r->blr_layouts))
+		r->blr_recalled = 0;
+	dprintk("<-- %s\n", __func__);
+}
+
+/*
+ * layout_cache_fill_from_list -- fills from cache list
+ *
+ * NOTE: This routine was only separated out from layout_cache_fill_from()
+ * to reduce the indentation level which makes the code easier to read.
+ */
+static inline boolean_t
+layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h,
+    struct nfsd4_layout_seg *seg)
+{
+	pnfs_blocklayout_layout_t *b,
+	    *n;
+	enum pnfs_block_extent_state4 s;
+
+	list_for_each_entry(b, &r->blr_layouts, bll_list) {
+		if (seg->offset < b->bll_foff) {
+			/* ---- Gap before this cache entry: new segment. ---- */
+			n = bll_alloc(seg->offset,
+			    MIN(seg->length, b->bll_foff - seg->offset),
+			    BLOCK_LAYOUT_NEW, NULL);
+			if (!n)
+				return False;
+
+			list_add(&n->bll_list, h->prev);
+			dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n",
+			    _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
+			    _2SECTS(b->bll_foff), _2SECTS(b->bll_len));
+			seg->offset += n->bll_len;
+			seg->length -= n->bll_len;
+			if (!seg->length)
+				break;
+		}
+
+		if ((seg->offset >= b->bll_foff) &&
+		    (seg->offset < BLL_F_END(b))) {
+			if (layout_conflict(b, seg->iomode, &s) == False) {
+				dprintk(" CONFLICT FOUND: "
+				    "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n",
+				    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
+				    _2SECTS(b->bll_soff), b->bll_es,
+				    seg->iomode);
+				return False;
+			}
+			n = bll_alloc(seg->offset,
+			    MIN(seg->length, BLL_F_END(b) - seg->offset),
+			    BLOCK_LAYOUT_CACHE, h);
+			/*
+			 * Check the allocation *before* the debug print
+			 * below dereferences 'n' (previously this was a
+			 * NULL pointer dereference on allocation failure).
+			 */
+			if (!n)
+				return False;
+			dprintk(" CACHE hit: Found %Lu(f):%Lu(l): "
+			    "in %Lu(f):%Lu(l):%Lu(s):%d\n",
+			    _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
+			    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
+			    _2SECTS(b->bll_soff), b->bll_es);
+
+			n->bll_soff = b->bll_soff + seg->offset - b->bll_foff;
+			n->bll_vol_id.sbid = 0;
+			n->bll_vol_id.devid = b->bll_vol_id.devid;
+			n->bll_es = s;
+			seg->offset += n->bll_len;
+			seg->length -= n->bll_len;
+			if (!seg->length)
+				break;
+		}
+	}
+	return True;
+}
+
+/*
+ * bll_alloc_holey -- append a PNFS_BLOCK_NONE_DATA (hole) extent covering
+ * [offset, offset+length) to 'bl_candidates'.  Returns the length added,
+ * or 0 on allocation failure.
+ */
+static u64
+bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length,
+    dev_t dev)
+{
+	pnfs_blocklayout_layout_t *hole;
+
+	hole = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates);
+	if (hole == NULL)
+		return 0;
+
+	hole->bll_vol_id.sbid = 0;
+	hole->bll_vol_id.devid = dev;
+	hole->bll_es = PNFS_BLOCK_NONE_DATA;
+	return hole->bll_len;
+}
+
+/* Initialize the fiemap bookkeeping so extents_cleanup() is always safe. */
+static void
+extents_setup(struct fiemap_extent_info *fei)
+{
+	fei->fi_extents_start = NULL;
+}
+
+/*
+ * extents_count -- Determine the number of extents for a given range.
+ *
+ * No need to call set_fs() here because the function
+ * doesn't use copy_to_user() if it's only counting
+ * the number of extents needed.
+ */
+static void
+extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
+{
+	dprintk("  Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len));
+	fei->fi_flags = FIEMAP_FLAG_SYNC;
+	/* fi_extents_max == 0 puts ->fiemap() into counting-only mode. */
+	fei->fi_extents_max = 0;
+	fei->fi_extents_start = NULL;
+	fei->fi_extents_mapped = 0;
+	/* Length rounded up to a whole FS block; the return value is not
+	 * checked here — callers examine fi_extents_mapped instead. */
+	i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1);
+}
+
+/*
+ * extents_get -- Get list of extents for range
+ *
+ * extents_count() must have been called before this routine such that
+ * fi_extents_mapped is known.
+ */
+static boolean_t
+extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
+{
+	int m_space,
+	    rval;
+	struct fiemap_extent *fe;
+	mm_segment_t old_fs = get_fs();
+
+	/*
+	 * Now allocate the correct amount of space
+	 * needed. It's possible for the file to have changed
+	 * between calls which would require more space for
+	 * the extents. If that occurs the last extent will
+	 * not have FIEMAP_EXTENT_LAST set and the error will
+	 * be caught in extents_process().
+	 */
+	m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent);
+	/* kzalloc() replaces the kmalloc()+memset() pair. */
+	fe = kzalloc(m_space, GFP_KERNEL);
+	if (!fe)
+		return False;
+
+	fei->fi_extents_max = fei->fi_extents_mapped;
+	fei->fi_extents_mapped = 0;
+	fei->fi_extents_start = fe;
+
+	/*
+	 * ->fiemap() copies extents to a user buffer, so temporarily widen
+	 * the address limit to let it write to our kernel buffer.
+	 */
+	set_fs(KERNEL_DS);
+	rval = i->i_op->fiemap(i, fei, foff, len +
+	    (1 << i->i_sb->s_blocksize_bits) - 1);
+	set_fs(old_fs);
+
+	if (rval || !fei->fi_extents_mapped) {
+		dprintk("  No extents. Wanted %d, got %d\n",
+		    fei->fi_extents_max, fei->fi_extents_mapped);
+		kfree(fe);
+		fei->fi_extents_start = NULL;
+		return False;
+	}
+	return True;
+}
+
+/*
+ * extents_process -- runs through the extent returned from the file system and
+ * creates block layout entries.
+ */
+static boolean_t
+extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates,
+    struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b)
+{
+	struct fiemap_extent *fep,
+	    *fep_last = NULL;
+	int i;
+	pnfs_blocklayout_layout_t *n;
+	u64 last_end,
+	    rval;
+
+	/*
+	 * NOTE(review): extents_get()'s comment says a count/fetch mismatch
+	 * "will be caught in extents_process()", but no FIEMAP_EXTENT_LAST
+	 * check is visible here — TODO confirm.
+	 */
+	dprintk("--> %s\n", __func__);
+	for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped;
+	    i++, fep++) {
+
+		BUG_ON(!fep->fe_physical);
+		/*
+		 * Deal with corner cases of hole-y files.
+		 */
+		if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) !=
+		    fep->fe_logical)) {
+
+			/*
+			 * If the last extent doesn't end logically
+			 * at the beginning of the current we've got
+			 * hole and need to create a pNFS extent.
+			 */
+			dprintk("  Got a hole at %Ld:%Ld \n",
+			    _2SECTS(fep_last->fe_logical),
+			    _2SECTS(fep_last->fe_length));
+			last_end = fep_last->fe_logical + fep_last->fe_length;
+			rval = bll_alloc_holey(bl_candidates, last_end,
+			    fep->fe_logical - last_end, dev);
+			if (!rval)
+				return False;
+			seg->length += rval;
+		}
+
+		n = bll_alloc(fep->fe_logical, fep->fe_length,
+		    BLOCK_LAYOUT_NEW, bl_candidates);
+		if (unlikely(n == NULL)) {
+			dprintk("%s: bll_alloc failed\n", __func__);
+			return False;
+		}
+
+		/* Map the FS extent onto storage and mark RO/RW per iomode. */
+		n->bll_soff = fep->fe_physical;
+		n->bll_es = seg->iomode == IOMODE_READ ?
+		    PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA;
+		n->bll_vol_id.sbid = 0;
+		n->bll_vol_id.devid = dev;
+		seg->length += fep->fe_length;
+		print_bll(n, "New extent");
+		fep_last = fep;
+	}
+	dprintk("<-- %s (i=%d)\n", __func__, i);
+
+	return True;
+}
+
+/* Release the fiemap extent buffer; safe to call repeatedly. */
+static void
+extents_cleanup(struct fiemap_extent_info *fei)
+{
+	/* kfree(NULL) is a no-op, so no guard is required. */
+	kfree(fei->fi_extents_start);
+	fei->fi_extents_start = NULL;
+}
+
+/*
+ * device_slice -- check to see if device is a slice or DM
+ */
+static boolean_t
+device_slice(dev_t devid)
+{
+	struct block_device *bd = blkdev_get_by_dev(devid, FMODE_READ, NULL);
+	boolean_t rval = False;
+
+	/*
+	 * blkdev_get_by_dev() returns an ERR_PTR() on failure, not NULL,
+	 * so a plain "if (bd)" would dereference an error pointer.
+	 */
+	if (!IS_ERR_OR_NULL(bd)) {
+		/* More than one minor => partitioned disk (a slice). */
+		if (bd->bd_disk->minors > 1)
+			rval = True;
+		blkdev_put(bd, FMODE_READ);
+	}
+	return rval;
+}
+
+/*
+ * device_dm -- check to see if device is a Device Mapper volume.
+ *
+ * Returns 1 for DM or 0 if not
+ */
+static boolean_t
+device_dm(dev_t devid)
+{
+	boolean_t rval = False;
+	bl_comm_msg_t msg;
+	bl_comm_res_t *res = NULL;	/* was uninitialized: the failure
+					 * path below kfree'd a wild pointer */
+
+	msg.msg_type = PNFS_UPCALL_MSG_DMCHK;
+	msg.u.msg_dev = devid;
+	if (bl_upcall(bl_comm_global, &msg, &res)) {
+		dprintk("Failed upcall to check on DM status\n");
+	} else if (res->u.dm_vol) {
+		rval = True;
+		dprintk("Device is DM volume\n");
+	} else
+		dprintk("Device is not DM volume\n");
+	kfree(res);	/* kfree(NULL) is a no-op */
+
+	return rval;
+}
+
+/*
+ * layout_inode_add -- create and hash a per-inode layout record.
+ * On success stores the new record through 'p' and returns True.
+ */
+static boolean_t
+layout_inode_add(struct inode *i, bl_layout_rec_t **p)
+{
+	bl_layout_rec_t *r;
+
+	/*
+	 * The block layout server relies on ->fiemap to map file ranges to
+	 * storage extents and ->fallocate to provision writable layouts;
+	 * refuse file systems lacking either.
+	 */
+	if (!i->i_op->fiemap || !i->i_fop->fallocate) {
+		/* was "...fiemap orfallocate..." — missing space */
+		printk("pNFS: file system doesn't support required fiemap or "
+		    "fallocate methods\n");
+		return False;
+	}
+
+	r = kmalloc(sizeof (*r), GFP_KERNEL);
+	if (!r)
+		return False;	/* was an error label with dead "if (r)" */
+
+	r->blr_rdev = i->i_sb->s_dev;
+	r->blr_inode = i;
+	r->blr_orig_size = i->i_size;
+	r->blr_ext_size = 0;
+	r->blr_recalled = 0;
+	INIT_LIST_HEAD(&r->blr_layouts);
+	spin_lock_init(&r->blr_lock);
+	spin_lock(&layout_hashtbl_lock);
+	list_add_tail(&r->blr_hash, &layout_hash);
+	spin_unlock(&layout_hashtbl_lock);
+	*p = r;
+	return True;
+}
+
+/*
+ * __layout_inode_find -- look up the layout record for an inode.
+ * Caller must hold layout_hashtbl_lock.
+ */
+static bl_layout_rec_t *
+__layout_inode_find(struct inode *i)
+{
+	bl_layout_rec_t *r;
+
+	/*
+	 * Match on both inode number and containing device.
+	 * (list_for_each_entry handles an empty list, so no separate
+	 * emptiness check is needed.)
+	 */
+	list_for_each_entry(r, &layout_hash, blr_hash)
+		if ((r->blr_inode->i_ino == i->i_ino) &&
+		    (r->blr_rdev == i->i_sb->s_dev))
+			return r;
+
+	return NULL;
+}
+
+/* Locked wrapper around __layout_inode_find(). */
+static bl_layout_rec_t *
+layout_inode_find(struct inode *i)
+{
+	bl_layout_rec_t *rec;
+
+	spin_lock(&layout_hashtbl_lock);
+	rec = __layout_inode_find(i);
+	spin_unlock(&layout_hashtbl_lock);
+	return rec;
+}
+
+/*
+ * layout_inode_del -- unhash and free the per-inode record, but only when
+ * no layouts remain outstanding on it.
+ *
+ * Locking: layout_hashtbl_lock is taken first; blr_lock nests inside it.
+ */
+static void
+layout_inode_del(struct inode *i)
+{
+	bl_layout_rec_t *r;
+
+	spin_lock(&layout_hashtbl_lock);
+	r = __layout_inode_find(i);
+	if (r) {
+		spin_lock(&r->blr_lock);
+		if (list_empty(&r->blr_layouts)) {
+			list_del(&r->blr_hash);
+			spin_unlock(&r->blr_lock);
+			kfree(r);
+		} else {
+			/* Layouts still outstanding: keep the record. */
+			spin_unlock(&r->blr_lock);
+		}
+	} else {
+		dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n",
+		    __func__, i->i_sb->s_dev, i->i_ino);
+	}
+	spin_unlock(&layout_hashtbl_lock);
+}
+
+/*
+ * map_state2name -- converts state in ascii string.
+ *
+ * Used for debug messages only.
+ */
+static char *
+map_state2name(enum pnfs_block_extent_state4 s)
+{
+	/* Debug-only mapping of extent state to a printable name. */
+	if (s == PNFS_BLOCK_READWRITE_DATA)
+		return " RW";
+	if (s == PNFS_BLOCK_READ_DATA)
+		return " RO";
+	if (s == PNFS_BLOCK_INVALID_DATA)
+		return "INVALID";
+	if (s == PNFS_BLOCK_NONE_DATA)
+		return " NONE";
+	/* Unknown state: same hard stop as the original switch default. */
+	BUG();
+}
+
+/*
+ * bld_alloc -- allocate a zeroed devinfo node of the given type and
+ * append it to 'volumes'.  Returns NULL on allocation failure.
+ */
+static pnfs_blocklayout_devinfo_t *
+bld_alloc(struct list_head *volumes, int type)
+{
+	pnfs_blocklayout_devinfo_t *bld;
+
+	/* kzalloc() replaces the kmalloc()+memset() pair. */
+	bld = kzalloc(sizeof (*bld), GFP_KERNEL);
+	if (!bld)
+		return NULL;
+
+	bld->bld_type = type;
+	list_add_tail(&bld->bld_list, volumes);
+
+	return bld;
+}
+
+/* Unlink a devinfo node from its volume list and release it. */
+static void
+bld_free(pnfs_blocklayout_devinfo_t *bld)
+{
+	list_del(&bld->bld_list);
+	kfree(bld);
+}
+
+/* Debug helper: dump one block-layout entry (values printed in sectors). */
+static void
+print_bll(pnfs_blocklayout_layout_t *b, char *text)
+{
+	dprintk("  BLL: %s\n", text);
+	dprintk("    foff %Lu, soff %Lu, len %Lu, state %s\n",
+	    _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len),
+	    map_state2name(b->bll_es));
+}
+
+/*
+ * bll_collapse -- after 'c' has grown, delete following cache entries whose
+ * storage ranges are now covered by 'c', extending 'c' when the last
+ * swallowed entry sticks out past it.  Caller holds r->blr_lock.
+ */
+static inline void
+bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c)
+{
+	pnfs_blocklayout_layout_t *n;
+	int dbg_count = 0;
+	u64 endpoint;
+
+	BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA);
+	while (c->bll_list.next != &r->blr_layouts) {
+		n = list_entry(c->bll_list.next,
+		    struct pnfs_blocklayout_layout, bll_list);
+		endpoint = BLL_S_END(c);
+		/* Next entry's storage start now falls inside 'c'? */
+		if ((n->bll_soff >= c->bll_soff) &&
+		    (n->bll_soff < endpoint)) {
+			if (endpoint < BLL_S_END(n)) {
+				/*
+				 * The following is possible.
+				 *
+				 *
+				 * Existing: +---+    +---+
+				 * New:      +-----------------------+
+				 * The client request merge entries together
+				 * but didn't require picking up all of the
+				 * last entry. So, we still need to delete
+				 * the last entry and add the remaining space
+				 * to the new entry.
+				 */
+				c->bll_len += BLL_S_END(n) - endpoint;
+			}
+			dbg_count++;
+			list_del(&n->bll_list);
+			kfree(n);
+		} else {
+			break;
+		}
+	}
+	/* ---- Debug only, remove before integration ---- */
+	if (dbg_count)
+		dprintk("  Collapsed %d cache entries between %Lu(s) and %Lu(s)\n",
+		    dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c)));
+}
+
+/*
+ * bll_alloc -- allocate a zeroed layout entry for [offset, offset+len)
+ * with the given cache state, appending it to 'h' when non-NULL.
+ * Returns NULL on allocation failure.
+ */
+static pnfs_blocklayout_layout_t *
+bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h)
+{
+	pnfs_blocklayout_layout_t *n;
+
+	/* kzalloc() replaces the kmalloc()+memset() pair (matches bld_alloc). */
+	n = kzalloc(sizeof (*n), GFP_KERNEL);
+	if (n) {
+		n->bll_foff = offset;
+		n->bll_len = len;
+		n->bll_cache_state = state;
+		if (h)
+			list_add_tail(&n->bll_list, h);
+	}
+	return n;
+}
+
+/*
+ * bll_alloc_dup -- duplicate layout entry 'b' with cache state 'c',
+ * appending the copy to 'h' when non-NULL.  Returns NULL on allocation
+ * failure.
+ */
+static pnfs_blocklayout_layout_t *
+bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c,
+    struct list_head *h)
+{
+	pnfs_blocklayout_layout_t *copy;
+
+	copy = bll_alloc(b->bll_foff, b->bll_len, c, h);
+	if (!copy)
+		return NULL;
+
+	copy->bll_es = b->bll_es;
+	copy->bll_soff = b->bll_soff;
+	copy->bll_vol_id.devid = b->bll_vol_id.devid;
+	return copy;
+}
+
+/*
+ * layout_conflict -- decide whether an existing extent 'b' may be handed
+ * out for the requested 'iomode'.  On success (*s) holds the extent state
+ * the client should see, which may be downgraded for read-only access.
+ */
+static inline boolean_t
+layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode,
+    enum pnfs_block_extent_state4 *s)
+{
+	/* ---- Normal case ---- */
+	*s = b->bll_es;
+
+	if (b->bll_es == PNFS_BLOCK_READWRITE_DATA) {
+		/* ---- Any use is permitted; readers see RO. ---- */
+		if (iomode == IOMODE_READ)
+			*s = PNFS_BLOCK_READ_DATA;
+		return True;
+	}
+	if (b->bll_es == PNFS_BLOCK_READ_DATA) {
+		/* ---- Committed as read only data. ---- */
+		return (iomode == IOMODE_RW) ? False : True;
+	}
+	if (b->bll_es == PNFS_BLOCK_INVALID_DATA) {
+		/* ---- Blocks have been allocated, but not initialized ---- */
+		if (iomode == IOMODE_READ)
+			*s = PNFS_BLOCK_NONE_DATA;
+		return True;
+	}
+	if (b->bll_es == PNFS_BLOCK_NONE_DATA) {
+		/* ---- Hole-y file. No backing store avail. ---- */
+		return (iomode != IOMODE_READ) ? False : True;
+	}
+	/* Unknown extent state: same hard stop as the original default. */
+	BUG();
+}
+
+#endif /* CONFIG_SPNFS_BLOCK */
diff -up linux-2.6.38.noarch/fs/nfsd/export.c.orig linux-2.6.38.noarch/fs/nfsd/export.c
--- linux-2.6.38.noarch/fs/nfsd/export.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/export.c 2011-03-26 07:57:44.282821243 -0400
@@ -16,11 +16,19 @@
#include <linux/module.h>
#include <linux/exportfs.h>
+#include <linux/nfsd/nfsd4_pnfs.h>
+#if defined(CONFIG_SPNFS)
+#include <linux/nfsd4_spnfs.h>
+#if defined(CONFIG_SPNFS_BLOCK)
+#include <linux/nfsd4_block.h>
+#endif
+#endif
#include <linux/nfsd/syscall.h>
#include <net/ipv6.h>
#include "nfsd.h"
#include "nfsfh.h"
+#include "pnfsd.h"
#define NFSDDBG_FACILITY NFSDDBG_EXPORT
@@ -348,10 +356,84 @@ static int svc_export_upcall(struct cach
return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
}
+#if defined(CONFIG_PNFSD)
+static struct pnfsd_cb_operations pnfsd_cb_op = {
+ .cb_layout_recall = nfsd_layout_recall_cb,
+ .cb_device_notify = nfsd_device_notify_cb,
+
+ .cb_get_state = nfs4_pnfs_cb_get_state,
+ .cb_change_state = nfs4_pnfs_cb_change_state,
+};
+
+#if defined(CONFIG_SPNFS)
+static struct pnfs_export_operations spnfs_export_ops = {
+ .layout_type = spnfs_layout_type,
+ .get_device_info = spnfs_getdeviceinfo,
+ .get_device_iter = spnfs_getdeviceiter,
+ .layout_get = spnfs_layoutget,
+ .layout_return = spnfs_layoutreturn,
+};
+
+static struct pnfs_export_operations spnfs_ds_export_ops = {
+ .get_state = spnfs_get_state,
+};
+
+#if defined(CONFIG_SPNFS_BLOCK)
+static struct pnfs_export_operations bl_export_ops = {
+ .layout_type = bl_layout_type,
+ .get_device_info = bl_getdeviceinfo,
+ .get_device_iter = bl_getdeviceiter,
+ .layout_get = bl_layoutget,
+ .layout_return = bl_layoutreturn,
+};
+#endif /* CONFIG_SPNFS_BLOCK */
+#endif /* CONFIG_SPNFS */
+#endif /* CONFIG_PNFSD */
+
static struct svc_export *svc_export_update(struct svc_export *new,
struct svc_export *old);
static struct svc_export *svc_export_lookup(struct svc_export *);
+static int pnfsd_check_export(struct inode *inode, int *flags)
+{
+#if defined(CONFIG_PNFSD)
+
+#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
+ if (!inode->i_sb->s_pnfs_op)
+ pnfsd_lexp_init(inode);
+ return 0;
+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
+
+#if defined(CONFIG_SPNFS)
+#if defined(CONFIG_SPNFS_BLOCK)
+ if (pnfs_block_enabled(inode, *flags)) {
+ dprintk("set pnfs block export structure... \n");
+ inode->i_sb->s_pnfs_op = &bl_export_ops;
+ } else
+#endif /* CONFIG_SPNFS_BLOCK */
+ /*
+ * spnfs_enabled() indicates we're an MDS.
+ * XXX Better to check an export time option as well.
+ */
+ if (spnfs_enabled()) {
+ dprintk("set spnfs export structure...\n");
+ inode->i_sb->s_pnfs_op = &spnfs_export_ops;
+ } else {
+ dprintk("%s spnfs not in use\n", __func__);
+
+ /*
+ * get_state is needed if we're a DS using spnfs.
+ * XXX Better to check an export time option instead.
+ */
+ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops;
+ }
+#endif /* CONFIG_SPNFS */
+
+#endif /* CONFIG_PNFSD */
+
+ return 0;
+}
+
static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
{
@@ -391,8 +473,17 @@ static int check_export(struct inode *in
return -EINVAL;
}
- return 0;
+#if !defined(CONFIG_SPNFS)
+ if (inode->i_sb->s_pnfs_op &&
+ (!inode->i_sb->s_pnfs_op->layout_type ||
+ !inode->i_sb->s_pnfs_op->get_device_info ||
+ !inode->i_sb->s_pnfs_op->layout_get)) {
+ dprintk("exp_export: export of invalid fs pnfs export ops.\n");
+ return -EINVAL;
+ }
+#endif /* !CONFIG_SPNFS */
+ return pnfsd_check_export(inode, flags);
}
#ifdef CONFIG_NFSD_V4
@@ -582,6 +673,8 @@ static int svc_export_parse(struct cache
if (exp.ex_uuid == NULL)
err = -ENOMEM;
}
+ } else if (strcmp(buf, "pnfs") == 0) {
+ exp.ex_pnfs = 1;
} else if (strcmp(buf, "secinfo") == 0)
err = secinfo_parse(&mesg, buf, &exp);
else
@@ -656,6 +749,8 @@ static int svc_export_show(struct seq_fi
seq_printf(m, "%02x", exp->ex_uuid[i]);
}
}
+ if (exp->ex_pnfs)
+ seq_puts(m, ",pnfs");
show_secinfo(m, exp);
}
seq_puts(m, ")\n");
@@ -683,6 +778,7 @@ static void svc_export_init(struct cache
new->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = 0;
+ new->ex_pnfs = 0;
}
static void export_update(struct cache_head *cnew, struct cache_head *citem)
@@ -695,6 +791,7 @@ static void export_update(struct cache_h
new->ex_anon_uid = item->ex_anon_uid;
new->ex_anon_gid = item->ex_anon_gid;
new->ex_fsid = item->ex_fsid;
+ new->ex_pnfs = item->ex_pnfs;
new->ex_uuid = item->ex_uuid;
item->ex_uuid = NULL;
new->ex_pathname = item->ex_pathname;
@@ -1662,8 +1759,17 @@ nfsd_export_init(void)
if (rv)
return rv;
rv = cache_register(&svc_expkey_cache);
- if (rv)
+ if (rv) {
cache_unregister(&svc_export_cache);
+ goto out;
+ }
+#if defined(CONFIG_PNFSD)
+ spin_lock(&pnfsd_cb_ctl.lock);
+ pnfsd_cb_ctl.module = THIS_MODULE;
+ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op;
+ spin_unlock(&pnfsd_cb_ctl.lock);
+#endif /* CONFIG_PNFSD */
+out:
return rv;
}
@@ -1691,6 +1797,12 @@ nfsd_export_shutdown(void)
exp_writelock();
+#if defined(CONFIG_PNFSD)
+ spin_lock(&pnfsd_cb_ctl.lock);
+ pnfsd_cb_ctl.module = NULL;
+ pnfsd_cb_ctl.cb_op = NULL;
+ spin_unlock(&pnfsd_cb_ctl.lock);
+#endif /* CONFIG_PNFSD */
cache_unregister(&svc_expkey_cache);
cache_unregister(&svc_export_cache);
svcauth_unix_purge();
diff -up linux-2.6.38.noarch/fs/nfs/dir.c.orig linux-2.6.38.noarch/fs/nfs/dir.c
--- linux-2.6.38.noarch/fs/nfs/dir.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/dir.c 2011-03-26 07:57:44.245821557 -0400
@@ -1161,19 +1161,22 @@ static void nfs_dentry_iput(struct dentr
if (S_ISDIR(inode->i_mode))
/* drop any readdir cache as it could easily be old */
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
-
- if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
drop_nlink(inode);
- nfs_complete_unlink(dentry, inode);
- }
iput(inode);
}
+static void nfs_d_unlink(struct dentry *parent, struct dentry *dentry)
+{
+ nfs_complete_unlink(parent, dentry);
+}
+
const struct dentry_operations nfs_dentry_operations = {
.d_revalidate = nfs_lookup_revalidate,
.d_delete = nfs_dentry_delete,
.d_iput = nfs_dentry_iput,
.d_automount = nfs_d_automount,
+ .d_unlink = nfs_d_unlink,
};
static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1248,6 +1251,7 @@ const struct dentry_operations nfs4_dent
.d_delete = nfs_dentry_delete,
.d_iput = nfs_dentry_iput,
.d_automount = nfs_d_automount,
+ .d_unlink = nfs_d_unlink,
};
/*
diff -up linux-2.6.38.noarch/fs/nfs/direct.c.orig linux-2.6.38.noarch/fs/nfs/direct.c
--- linux-2.6.38.noarch/fs/nfs/direct.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/direct.c 2011-03-26 07:57:44.246821549 -0400
@@ -271,6 +271,38 @@ static const struct rpc_call_ops nfs_rea
.rpc_release = nfs_direct_read_release,
};
+static long nfs_direct_read_execute(struct nfs_read_data *data,
+ struct rpc_task_setup *task_setup_data,
+ struct rpc_message *msg)
+{
+ struct inode *inode = data->inode;
+ struct rpc_task *task;
+
+ nfs_fattr_init(&data->fattr);
+ msg->rpc_argp = &data->args;
+ msg->rpc_resp = &data->res;
+
+ task_setup_data->task = &data->task;
+ task_setup_data->callback_data = data;
+ NFS_PROTO(inode)->read_setup(data, msg);
+
+ task = rpc_run_task(task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ rpc_put_task(task);
+
+ dprintk("NFS: %5u initiated direct read call "
+ "(req %s/%lld, %u bytes @ offset %llu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+
+ return 0;
+}
+
/*
* For each rsize'd chunk of the user's buffer, dispatch an NFS READ
* operation. If nfs_readdata_alloc() or get_user_pages() fails,
@@ -287,7 +319,6 @@ static ssize_t nfs_direct_read_schedule_
unsigned long user_addr = (unsigned long)iov->iov_base;
size_t count = iov->iov_len;
size_t rsize = NFS_SERVER(inode)->rsize;
- struct rpc_task *task;
struct rpc_message msg = {
.rpc_cred = ctx->cred,
};
@@ -348,26 +379,9 @@ static ssize_t nfs_direct_read_schedule_
data->res.fattr = &data->fattr;
data->res.eof = 0;
data->res.count = bytes;
- nfs_fattr_init(&data->fattr);
- msg.rpc_argp = &data->args;
- msg.rpc_resp = &data->res;
- task_setup_data.task = &data->task;
- task_setup_data.callback_data = data;
- NFS_PROTO(inode)->read_setup(data, &msg);
-
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task))
- break;
- rpc_put_task(task);
-
- dprintk("NFS: %5u initiated direct read call "
- "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- bytes,
- (unsigned long long)data->args.offset);
+ if (nfs_direct_read_execute(data, &task_setup_data, &msg))
+ break;
started += bytes;
user_addr += bytes;
@@ -460,12 +474,15 @@ static void nfs_direct_free_writedata(st
}
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static long nfs_direct_write_execute(struct nfs_write_data *data,
+ struct rpc_task_setup *task_setup_data,
+ struct rpc_message *msg);
+
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
struct inode *inode = dreq->inode;
struct list_head *p;
struct nfs_write_data *data;
- struct rpc_task *task;
struct rpc_message msg = {
.rpc_cred = dreq->ctx->cred,
};
@@ -499,25 +516,7 @@ static void nfs_direct_write_reschedule(
* Reuse data->task; data->args should not have changed
* since the original request was sent.
*/
- task_setup_data.task = &data->task;
- task_setup_data.callback_data = data;
- msg.rpc_argp = &data->args;
- msg.rpc_resp = &data->res;
- NFS_PROTO(inode)->write_setup(data, &msg);
-
- /*
- * We're called via an RPC callback, so BKL is already held.
- */
- task = rpc_run_task(&task_setup_data);
- if (!IS_ERR(task))
- rpc_put_task(task);
-
- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- data->args.count,
- (unsigned long long)data->args.offset);
+ nfs_direct_write_execute(data, &task_setup_data, &msg);
}
if (put_dreq(dreq))
@@ -560,10 +559,31 @@ static const struct rpc_call_ops nfs_com
.rpc_release = nfs_direct_commit_release,
};
+static long nfs_direct_commit_execute(struct nfs_direct_req *dreq,
+ struct nfs_write_data *data,
+ struct rpc_task_setup *task_setup_data,
+ struct rpc_message *msg)
+{
+ struct rpc_task *task;
+
+ NFS_PROTO(data->inode)->commit_setup(data, msg);
+
+ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
+ dreq->commit_data = NULL;
+
+ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+
+ task = rpc_run_task(task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ rpc_put_task(task);
+ return 0;
+}
+
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
struct nfs_write_data *data = dreq->commit_data;
- struct rpc_task *task;
struct rpc_message msg = {
.rpc_argp = &data->args,
.rpc_resp = &data->res,
@@ -592,16 +612,7 @@ static void nfs_direct_commit_schedule(s
data->res.verf = &data->verf;
nfs_fattr_init(&data->fattr);
- NFS_PROTO(data->inode)->commit_setup(data, &msg);
-
- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
- dreq->commit_data = NULL;
-
- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
-
- task = rpc_run_task(&task_setup_data);
- if (!IS_ERR(task))
- rpc_put_task(task);
+ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg);
}
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
@@ -703,6 +714,36 @@ static const struct rpc_call_ops nfs_wri
.rpc_release = nfs_direct_write_release,
};
+static long nfs_direct_write_execute(struct nfs_write_data *data,
+ struct rpc_task_setup *task_setup_data,
+ struct rpc_message *msg)
+{
+ struct inode *inode = data->inode;
+ struct rpc_task *task;
+
+ task_setup_data->task = &data->task;
+ task_setup_data->callback_data = data;
+ msg->rpc_argp = &data->args;
+ msg->rpc_resp = &data->res;
+ NFS_PROTO(inode)->write_setup(data, msg);
+
+ task = rpc_run_task(task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ rpc_put_task(task);
+
+ dprintk("NFS: %5u initiated direct write call "
+ "(req %s/%lld, %u bytes @ offset %llu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+
+ return 0;
+}
+
/*
* For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
* operation. If nfs_writedata_alloc() or get_user_pages() fails,
@@ -718,7 +759,6 @@ static ssize_t nfs_direct_write_schedule
struct inode *inode = ctx->path.dentry->d_inode;
unsigned long user_addr = (unsigned long)iov->iov_base;
size_t count = iov->iov_len;
- struct rpc_task *task;
struct rpc_message msg = {
.rpc_cred = ctx->cred,
};
@@ -785,24 +825,8 @@ static ssize_t nfs_direct_write_schedule
data->res.verf = &data->verf;
nfs_fattr_init(&data->fattr);
- task_setup_data.task = &data->task;
- task_setup_data.callback_data = data;
- msg.rpc_argp = &data->args;
- msg.rpc_resp = &data->res;
- NFS_PROTO(inode)->write_setup(data, &msg);
-
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task))
- break;
- rpc_put_task(task);
-
- dprintk("NFS: %5u initiated direct write call "
- "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- bytes,
- (unsigned long long)data->args.offset);
+ if (nfs_direct_write_execute(data, &task_setup_data, &msg))
+ break;
started += bytes;
user_addr += bytes;
diff -up linux-2.6.38.noarch/fs/nfsd/Kconfig.orig linux-2.6.38.noarch/fs/nfsd/Kconfig
--- linux-2.6.38.noarch/fs/nfsd/Kconfig.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/Kconfig 2011-03-26 07:57:44.278821276 -0400
@@ -91,3 +91,52 @@ config NFSD_V4
available from http://linux-nfs.org/.
If unsure, say N.
+
+config PNFSD
+ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)"
+ depends on NFSD_V4 && EXPERIMENTAL
+ select EXPORTFS_FILE_LAYOUT
+ help
+ This option enables support for the parallel NFS features of the
+ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1)
+ in the kernel's NFS server.
+
+ Unless you're an NFS developer, say N.
+
+config PNFSD_LOCAL_EXPORT
+ bool "Enable pNFS support for exporting local filesystems for debugging purposes"
+ depends on PNFSD
+ help
+ Say Y here if you want your pNFS server to export local file systems
+ over the files layout type. With this option the MDS (metadata
+ server) functions also as a single DS (data server). This is mostly
+ useful for development and debugging purposes.
+
+ If unsure, say N.
+
+config SPNFS
+ bool "Provide spNFS server support (EXPERIMENTAL)"
+ depends on PNFSD
+ select RPCSEC_GSS_KRB5
+ help
+ Say Y here if you want spNFS server support.
+
+ If unsure, say N.
+
+config SPNFS_LAYOUTSEGMENTS
+ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)"
+ depends on SPNFS
+ select RPCSEC_GSS_KRB5
+ help
+ Say Y here if you want spNFS to be able to return layout segments.
+
+ If unsure, say N.
+
+config SPNFS_BLOCK
+ bool "Provide Block Layout server support (EXPERIMENTAL)"
+ depends on SPNFS
+ select EXPORTFS_BLOCK_LAYOUT
+ help
+	  Say Y here if you want spNFS block layout support.
+
+ If unsure, say N.
diff -up linux-2.6.38.noarch/fs/nfsd/Makefile.orig linux-2.6.38.noarch/fs/nfsd/Makefile
--- linux-2.6.38.noarch/fs/nfsd/Makefile.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/Makefile 2011-03-26 07:57:44.279821268 -0400
@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfs4acl.o nfs4callback.o nfs4recover.o
+nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o
+nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o
+nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o
+nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o
diff -up linux-2.6.38.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4callback.c
--- linux-2.6.38.noarch/fs/nfsd/nfs4callback.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/nfs4callback.c 2011-03-26 07:57:44.284821225 -0400
@@ -39,6 +39,8 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
+static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
+
#define NFSPROC4_CB_NULL 0
#define NFSPROC4_CB_COMPOUND 1
@@ -48,6 +50,8 @@ enum {
NFSPROC4_CLNT_CB_NULL = 0,
NFSPROC4_CLNT_CB_RECALL,
NFSPROC4_CLNT_CB_SEQUENCE,
+ NFSPROC4_CLNT_CB_LAYOUT,
+ NFSPROC4_CLNT_CB_DEVICE,
};
#define NFS4_MAXTAGLEN 20
@@ -73,6 +77,19 @@ enum {
#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
cb_sequence_dec_sz + \
op_dec_sz)
+#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 1 + 3 + \
+ enc_nfs4_fh_sz + 4)
+#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
+#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 1 + 6)
+#define NFS4_dec_cb_device_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
struct nfs4_cb_compound_hdr {
/* args */
@@ -361,6 +378,151 @@ static void encode_cb_recall4args(struct
hdr->nops++;
}
+#if defined(CONFIG_PNFSD)
+
+#include "pnfsd.h"
+
+/*
+ * CB_LAYOUTRECALL4args
+ *
+ * struct layoutrecall_file4 {
+ * nfs_fh4 lor_fh;
+ * offset4 lor_offset;
+ * length4 lor_length;
+ * stateid4 lor_stateid;
+ * };
+ *
+ * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
+ * case LAYOUTRECALL4_FILE:
+ * layoutrecall_file4 lor_layout;
+ * case LAYOUTRECALL4_FSID:
+ * fsid4 lor_fsid;
+ * case LAYOUTRECALL4_ALL:
+ * void;
+ * };
+ *
+ * struct CB_LAYOUTRECALL4args {
+ * layouttype4 clora_type;
+ * layoutiomode4 clora_iomode;
+ * bool clora_changed;
+ * layoutrecall4 clora_recall;
+ * };
+ */
+static void encode_cb_layout4args(struct xdr_stream *xdr,
+ const struct nfs4_layoutrecall *clr,
+ struct nfs4_cb_compound_hdr *hdr)
+{
+ u32 *p;
+
+ BUG_ON(hdr->minorversion == 0);
+
+ p = xdr_reserve_space(xdr, 5 * 4);
+ *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
+ *p++ = cpu_to_be32(clr->cb.cbl_seg.layout_type);
+ *p++ = cpu_to_be32(clr->cb.cbl_seg.iomode);
+ *p++ = cpu_to_be32(clr->cb.cbl_layoutchanged);
+ *p = cpu_to_be32(clr->cb.cbl_recall_type);
+ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) {
+ struct nfs4_fsid fsid = clr->cb.cbl_fsid;
+
+ p = xdr_reserve_space(xdr, 2 * 8);
+ p = xdr_encode_hyper(p, fsid.major);
+ xdr_encode_hyper(p, fsid.minor);
+ dprintk("%s: type %x iomode %d changed %d recall_type %d "
+ "fsid 0x%llx-0x%llx\n",
+ __func__, clr->cb.cbl_seg.layout_type,
+ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
+ clr->cb.cbl_recall_type, fsid.major, fsid.minor);
+ } else if (clr->cb.cbl_recall_type == RETURN_FILE) {
+ int len = clr->clr_file->fi_fhlen;
+ stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid;
+
+ p = xdr_reserve_space(xdr, 4 + len + 2 * 8);
+ *p++ = cpu_to_be32(len);
+ xdr_encode_opaque_fixed(p, clr->clr_file->fi_fhval, len);
+ p += XDR_QUADLEN(len);
+ p = xdr_encode_hyper(p, clr->cb.cbl_seg.offset);
+ xdr_encode_hyper(p, clr->cb.cbl_seg.length);
+ encode_stateid4(xdr, cbl_sid);
+ dprintk("%s: type %x iomode %d changed %d recall_type %d "
+ "offset %lld length %lld stateid " STATEID_FMT "\n",
+ __func__, clr->cb.cbl_seg.layout_type,
+ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
+ clr->cb.cbl_recall_type,
+ clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length,
+ STATEID_VAL(cbl_sid));
+ } else {
+ dprintk("%s: type %x iomode %d changed %d recall_type %d\n",
+ __func__, clr->cb.cbl_seg.layout_type,
+ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
+ clr->cb.cbl_recall_type);
+ }
+ hdr->nops++;
+}
+
+/*
+ * CB_NOTIFY_DEVICEID4args
+ *
+ * typedef opaque notifylist4<>;
+ *
+ * struct notify4 {
+ * bitmap4 notify_mask;
+ * notifylist4 notify_vals;
+ * };
+ *
+ * struct CB_NOTIFY_DEVICEID4args {
+ * notify4 cnda_changes<>;
+ * };
+ */
+static void encode_cb_device4args(struct xdr_stream *xdr,
+ const struct nfs4_notify_device *nd,
+ struct nfs4_cb_compound_hdr *hdr)
+{
+ u32 *p;
+ int i;
+ int len = nd->nd_list->cbd_len;
+ struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list;
+
+ dprintk("NFSD %s: --> num %d\n", __func__, len);
+
+ BUG_ON(hdr->minorversion == 0);
+
+ p = xdr_reserve_space(xdr, 2 * 4);
+ *p++ = cpu_to_be32(OP_CB_NOTIFY_DEVICEID);
+ /* notify4 cnda_changes<>; */
+ *p = cpu_to_be32(len);
+ for (i = 0; i < len; i++) {
+ dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n",
+ __func__, cbd[i].cbd_notify_type,
+ cbd[i].cbd_layout_type,
+ cbd[i].cbd_devid.sbid,
+ cbd[i].cbd_devid.devid,
+ cbd[i].cbd_immediate, i);
+
+ BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
+ cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE);
+ p = xdr_reserve_space(xdr, 4 * 4 + 2 * 8);
+ /* bitmap4 notify_mask; */
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(cbd[i].cbd_notify_type);
+ /* opaque notify_vals<>; */
+ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
+ *p++ = cpu_to_be32(24);
+ else
+ *p++ = cpu_to_be32(20);
+ *p++ = cpu_to_be32(cbd[i].cbd_layout_type);
+ p = xdr_encode_hyper(p, cbd[i].cbd_devid.sbid);
+ xdr_encode_hyper(p, cbd[i].cbd_devid.devid);
+
+ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) {
+ p = xdr_reserve_space(xdr, 4);
+ *p = cpu_to_be32(cbd[i].cbd_immediate);
+ }
+ }
+ hdr->nops++;
+}
+#endif /* CONFIG_PNFSD */
+
/*
* CB_SEQUENCE4args
*
@@ -460,6 +622,8 @@ static int decode_cb_sequence4resok(stru
*/
status = 0;
out:
+ if (status)
+ nfsd4_mark_cb_fault(cb->cb_clp, status);
return status;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -523,6 +687,39 @@ static void nfs4_xdr_enc_cb_recall(struc
encode_cb_nops(&hdr);
}
+#if defined(CONFIG_PNFSD)
+static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfsd4_callback *cb)
+{
+ const struct nfs4_layoutrecall *args = cb->cb_op;
+ struct nfs4_cb_compound_hdr hdr = {
+ .ident = 0,
+ .minorversion = cb->cb_minorversion,
+ };
+
+ encode_cb_compound4args(xdr, &hdr);
+ encode_cb_sequence4args(xdr, cb, &hdr);
+ encode_cb_layout4args(xdr, args, &hdr);
+ encode_cb_nops(&hdr);
+}
+
+static void nfs4_xdr_enc_cb_device(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfsd4_callback *cb)
+{
+ struct nfs4_notify_device *args = cb->cb_op;
+ struct nfs4_cb_compound_hdr hdr = {
+ .ident = 0,
+ .minorversion = cb->cb_minorversion,
+ };
+
+ encode_cb_compound4args(xdr, &hdr);
+ encode_cb_sequence4args(xdr, cb, &hdr);
+ encode_cb_device4args(xdr, args, &hdr);
+ encode_cb_nops(&hdr);
+}
+#endif /* CONFIG_PNFSD */
/*
* NFSv4.0 and NFSv4.1 XDR decode functions
@@ -569,6 +766,58 @@ out:
return status;
}
+#if defined(CONFIG_PNFSD)
+static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfsd4_callback *cb)
+{
+ struct nfs4_cb_compound_hdr hdr;
+ enum nfsstat4 nfserr;
+ int status;
+
+ status = decode_cb_compound4res(xdr, &hdr);
+ if (unlikely(status))
+ goto out;
+ if (cb) {
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status))
+ goto out;
+ }
+ status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
+ if (unlikely(status))
+ goto out;
+ if (unlikely(nfserr != NFS4_OK))
+ status = nfs_cb_stat_to_errno(nfserr);
+out:
+ return status;
+}
+
+static int nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfsd4_callback *cb)
+{
+ struct nfs4_cb_compound_hdr hdr;
+ enum nfsstat4 nfserr;
+ int status;
+
+ status = decode_cb_compound4res(xdr, &hdr);
+ if (unlikely(status))
+ goto out;
+ if (cb) {
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status))
+ goto out;
+ }
+ status = decode_cb_op_status(xdr, OP_CB_NOTIFY_DEVICEID, &nfserr);
+ if (unlikely(status))
+ goto out;
+ if (unlikely(nfserr != NFS4_OK))
+ status = nfs_cb_stat_to_errno(nfserr);
+out:
+ return status;
+}
+#endif /* CONFIG_PNFSD */
+
/*
* RPC procedure tables
*/
@@ -586,6 +835,10 @@ out:
static struct rpc_procinfo nfs4_cb_procedures[] = {
PROC(CB_NULL, NULL, cb_null, cb_null),
PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
+#if defined(CONFIG_PNFSD)
+ PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout),
+ PROC(CB_DEVICE, COMPOUND, cb_device, cb_device),
+#endif
};
static struct rpc_version nfs_cb_version4 = {
@@ -686,6 +939,12 @@ static void nfsd4_mark_cb_down(struct nf
warn_no_callback_path(clp, reason);
}
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
+{
+ clp->cl_cb_state = NFSD4_CB_FAULT;
+ warn_no_callback_path(clp, reason);
+}
+
static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
{
struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
@@ -783,11 +1042,10 @@ static bool nfsd41_cb_get_slot(struct nf
* TODO: cb_sequence should support referring call lists, cachethis, multiple
* slots, and mark callback channel down on communication errors.
*/
-static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
+static void nfsd4_cb_prepare_sequence(struct rpc_task *task,
+ struct nfsd4_callback *cb,
+ struct nfs4_client *clp)
{
- struct nfsd4_callback *cb = calldata;
- struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_client;
u32 minorversion = clp->cl_minorversion;
cb->cb_minorversion = minorversion;
@@ -805,12 +1063,17 @@ static void nfsd4_cb_prepare(struct rpc_
rpc_call_start(task);
}
-static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
+static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_client;
+ nfsd4_cb_prepare_sequence(task, cb, dp->dl_client);
+}
+
+static void nfsd4_cb_done_sequence(struct rpc_task *task,
+ struct nfs4_client *clp)
+{
dprintk("%s: minorversion=%d\n", __func__,
clp->cl_minorversion);
@@ -821,9 +1084,6 @@ static void nfsd4_cb_done(struct rpc_tas
rpc_wake_up_next(&clp->cl_cb_waitq);
dprintk("%s: freed slot, new seqid=%d\n", __func__,
clp->cl_cb_session->se_cb_seq_nr);
-
- /* We're done looking into the sequence information */
- task->tk_msg.rpc_resp = NULL;
}
}
@@ -835,7 +1095,7 @@ static void nfsd4_cb_recall_done(struct
struct nfs4_client *clp = dp->dl_client;
struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
- nfsd4_cb_done(task, calldata);
+ nfsd4_cb_done_sequence(task, clp);
if (current_rpc_client != task->tk_client) {
/* We're shutting down or changing cl_cb_client; leave
@@ -884,7 +1144,7 @@ static void nfsd4_cb_recall_release(void
}
static const struct rpc_call_ops nfsd4_cb_recall_ops = {
- .rpc_call_prepare = nfsd4_cb_prepare,
+ .rpc_call_prepare = nfsd4_cb_recall_prepare,
.rpc_call_done = nfsd4_cb_recall_done,
.rpc_release = nfsd4_cb_recall_release,
};
@@ -1024,3 +1284,188 @@ void nfsd4_cb_recall(struct nfs4_delegat
run_nfsd4_cb(&dp->dl_recall);
}
+
+#if defined(CONFIG_PNFSD)
+static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfsd4_callback *cb = calldata;
+ struct nfs4_layoutrecall *clr = container_of(cb, struct nfs4_layoutrecall, clr_recall);
+
+ nfsd4_cb_prepare_sequence(task, cb, clr->clr_client);
+}
+
+static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata)
+{
+ struct nfsd4_callback *cb = calldata;
+ struct nfs4_layoutrecall *clr = container_of(cb, struct nfs4_layoutrecall, clr_recall);
+ struct nfs4_client *clp = clr->clr_client;
+ struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
+
+ nfsd4_cb_done_sequence(task, clp);
+
+ if (current_rpc_client != task->tk_client) {
+ /* We're shutting down or changing cl_cb_client; leave
+ * it to nfsd4_process_cb_update to restart the call if
+ * necessary. */
+ return;
+ }
+
+ if (cb->cb_done)
+ return;
+
+ if (task->tk_status)
+ printk("%s: clp %p cb_client %p fp %p failed with status %d\n",
+ __func__,
+ clp,
+ clp->cl_cb_client,
+ clr->clr_file,
+ task->tk_status);
+
+ switch (task->tk_status) {
+ case 0:
+ goto done;
+
+ case -NFS4ERR_NOMATCHING_LAYOUT:
+ task->tk_status = 0;
+ nomatching_layout(clr);
+ goto done;
+
+ case -NFS4ERR_DELAY:
+ /* Poll the client until it's done with the layout */
+ /* FIXME: cap number of retries.
+ * The pnfs standard states that we need to only expire
+	 * the client after at least "lease time", e.g. lease-time * 2,
+ * when failing to communicate a recall
+ */
+		rpc_delay(task, HZ/100); /* 10 milliseconds */
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ return;
+
+ case -NFS4ERR_BADHANDLE:
+ /* FIXME: handle more gracefully */
+ goto done;
+
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_BADIOMODE:
+ case -NFS4ERR_BADXDR:
+ case -NFS4ERR_INVAL:
+ case -NFS4ERR_NOTSUPP:
+ case -NFS4ERR_OP_NOT_IN_SESSION:
+ case -NFS4ERR_REP_TOO_BIG:
+ case -NFS4ERR_REP_TOO_BIG_TO_CACHE:
+ case -NFS4ERR_REQ_TOO_BIG:
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ case -NFS4ERR_TOO_MANY_OPS:
+ case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
+ case -NFS4ERR_WRONG_TYPE:
+ /* We should never get these, yet it could be a result of a
+ * buggy client, therefore no BUG here.
+ */
+ goto done;
+
+ default:
+ break;
+ }
+
+ /* Network partition? */
+ nfsd4_mark_cb_down(clp, task->tk_status);
+done:
+ cb->cb_done = true;
+}
+
+static void nfsd4_cb_layout_release(void *calldata)
+{
+ struct nfsd4_callback *cb = calldata;
+ struct nfs4_layoutrecall *clr = container_of(cb, struct nfs4_layoutrecall, clr_recall);
+
+ put_layoutrecall(clr);
+}
+
+static const struct rpc_call_ops nfsd4_cb_layout_ops = {
+ .rpc_call_prepare = nfsd4_cb_layout_prepare,
+ .rpc_call_done = nfsd4_cb_layout_done,
+ .rpc_release = nfsd4_cb_layout_release,
+};
+
+/*
+ * Called with state lock.
+ */
+void
+nfsd4_cb_layout(struct nfs4_layoutrecall *clr)
+{
+ struct nfsd4_callback *cb = &clr->clr_recall;
+
+ cb->cb_op = clr;
+ cb->cb_clp = clr->clr_client;
+ cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT];
+ cb->cb_msg.rpc_argp = cb;
+ cb->cb_msg.rpc_resp = cb;
+ cb->cb_msg.rpc_cred = callback_cred;
+
+ cb->cb_ops = &nfsd4_cb_layout_ops;
+ run_nfsd4_cb(cb);
+}
+
+static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfsd4_callback *cb = calldata;
+ struct nfs4_notify_device *cbnd = container_of(cb, struct nfs4_notify_device, nd_recall);
+
+ nfsd4_cb_prepare_sequence(task, cb, cbnd->nd_client);
+}
+
+static void nfsd4_cb_device_done(struct rpc_task *task, void *calldata)
+{
+ struct nfsd4_callback *cb = calldata;
+ struct nfs4_notify_device *cbnd = container_of(cb, struct nfs4_notify_device, nd_recall);
+ struct nfs4_client *clp = cbnd->nd_client;
+
+ nfsd4_cb_done_sequence(task, clp);
+
+ dprintk("%s: clp %p cb_client %p: status %d\n",
+ __func__,
+ clp,
+ clp->cl_cb_client,
+ task->tk_status);
+
+ if (task->tk_status == -EIO) {
+ /* Network partition? */
+ nfsd4_mark_cb_down(clp, task->tk_status);
+ }
+ cb->cb_done = true;
+}
+
+static void nfsd4_cb_device_release(void *calldata)
+{
+ struct nfsd4_callback *cb = calldata;
+ struct nfs4_notify_device *cbnd = container_of(cb, struct nfs4_notify_device, nd_recall);
+
+ kfree(cbnd);
+}
+
+static const struct rpc_call_ops nfsd4_cb_device_ops = {
+ .rpc_call_prepare = nfsd4_cb_device_prepare,
+ .rpc_call_done = nfsd4_cb_device_done,
+ .rpc_release = nfsd4_cb_device_release,
+};
+
+/*
+ * Called with state lock.
+ */
+void
+nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd)
+{
+ struct nfsd4_callback *cb = &cbnd->nd_recall;
+
+ cb->cb_op = cbnd;
+ cb->cb_clp = cbnd->nd_client;
+ cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE];
+ cb->cb_msg.rpc_argp = cb;
+ cb->cb_msg.rpc_resp = cb;
+ cb->cb_msg.rpc_cred = callback_cred;
+
+ cb->cb_ops = &nfsd4_cb_device_ops;
+ run_nfsd4_cb(cb);
+}
+#endif /* CONFIG_PNFSD */
diff -up linux-2.6.38.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4pnfsd.c
--- linux-2.6.38.noarch/fs/nfsd/nfs4pnfsd.c.orig 2011-03-26 07:57:44.286821208 -0400
+++ linux-2.6.38.noarch/fs/nfsd/nfs4pnfsd.c 2011-03-26 07:57:44.286821208 -0400
@@ -0,0 +1,1688 @@
+/******************************************************************************
+ *
+ * (c) 2007 Network Appliance, Inc. All Rights Reserved.
+ * (c) 2009 NetApp. All Rights Reserved.
+ *
+ * NetApp provides this source code under the GPL v2 License.
+ * The GPL v2 license is available at
+ * http://opensource.org/licenses/gpl-license.php.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************/
+
+#include "pnfsd.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PROC
+
+/* Globals */
+static u32 current_layoutid = 1;
+
+/*
+ * Currently used for manipulating the layout state.
+ */
+static DEFINE_SPINLOCK(layout_lock);
+
+#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP)
+# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock))
+#else
+# define BUG_ON_UNLOCKED_LAYOUT()
+#endif
+
+/*
+ * Layout state - NFSv4.1 pNFS
+ */
+static struct kmem_cache *pnfs_layout_slab;
+static struct kmem_cache *pnfs_layoutrecall_slab;
+
+/* hash table for nfsd4_pnfs_deviceid.sbid */
+#define SBID_HASH_BITS 8
+#define SBID_HASH_SIZE (1 << SBID_HASH_BITS)
+#define SBID_HASH_MASK (SBID_HASH_SIZE - 1)
+
+struct sbid_tracker {
+ u64 id;
+ struct super_block *sb;
+ struct list_head hash;
+};
+
+static u64 current_sbid;
+static struct list_head sbid_hashtbl[SBID_HASH_SIZE];
+
+static inline unsigned long
+sbid_hashval(struct super_block *sb)
+{
+ return hash_ptr(sb, SBID_HASH_BITS);
+}
+
+static inline struct sbid_tracker *
+alloc_sbid(void)
+{
+ return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL);
+}
+
+static void
+destroy_sbid(struct sbid_tracker *sbid)
+{
+ spin_lock(&layout_lock);
+ list_del(&sbid->hash);
+ spin_unlock(&layout_lock);
+ kfree(sbid);
+}
+
+void
+nfsd4_free_pnfs_slabs(void)
+{
+ int i;
+ struct sbid_tracker *sbid;
+
+ nfsd4_free_slab(&pnfs_layout_slab);
+ nfsd4_free_slab(&pnfs_layoutrecall_slab);
+
+ for (i = 0; i < SBID_HASH_SIZE; i++) {
+ while (!list_empty(&sbid_hashtbl[i])) {
+ sbid = list_first_entry(&sbid_hashtbl[i],
+ struct sbid_tracker,
+ hash);
+ destroy_sbid(sbid);
+ }
+ }
+}
+
+int
+nfsd4_init_pnfs_slabs(void)
+{
+ int i;
+
+ pnfs_layout_slab = kmem_cache_create("pnfs_layouts",
+ sizeof(struct nfs4_layout), 0, 0, NULL);
+ if (pnfs_layout_slab == NULL)
+ return -ENOMEM;
+ pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls",
+ sizeof(struct nfs4_layoutrecall), 0, 0, NULL);
+ if (pnfs_layoutrecall_slab == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < SBID_HASH_SIZE; i++) {
+ INIT_LIST_HEAD(&sbid_hashtbl[i]);
+ }
+
+ return 0;
+}
+
+/* XXX: Need to implement the notify types and track which
+ * clients have which devices. */
+void pnfs_set_device_notify(clientid_t *clid, unsigned int types)
+{
+ struct nfs4_client *clp;
+ dprintk("%s: -->\n", __func__);
+
+ nfs4_lock_state();
+ /* Indicate that client has a device so we can only notify
+ * the correct clients */
+ clp = find_confirmed_client(clid);
+ if (clp) {
+ atomic_inc(&clp->cl_deviceref);
+ dprintk("%s: Incr device count (clnt %p) to %d\n",
+ __func__, clp, atomic_read(&clp->cl_deviceref));
+ }
+ nfs4_unlock_state();
+}
+
+/* Clear notifications for this client
+ * XXX: Do we need to loop through a clean up all
+ * krefs when nfsd cleans up the client? */
+void pnfs_clear_device_notify(struct nfs4_client *clp)
+{
+ atomic_dec(&clp->cl_deviceref);
+ dprintk("%s: Decr device count (clnt %p) to %d\n",
+ __func__, clp, atomic_read(&clp->cl_deviceref));
+}
+
+static struct nfs4_layout_state *
+alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp,
+ stateid_t *stateid)
+{
+ struct nfs4_layout_state *new;
+
+ /* FIXME: use a kmem_cache */
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return new;
+ get_nfs4_file(fp);
+ INIT_LIST_HEAD(&new->ls_perfile);
+ INIT_LIST_HEAD(&new->ls_layouts);
+ kref_init(&new->ls_ref);
+ new->ls_client = clp;
+ new->ls_file = fp;
+ new->ls_stateid.si_boot = stateid->si_boot;
+ new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */
+ new->ls_stateid.si_generation = 1;
+ spin_lock(&layout_lock);
+ new->ls_stateid.si_fileid = current_layoutid++;
+ list_add(&new->ls_perfile, &fp->fi_layout_states);
+ spin_unlock(&layout_lock);
+ return new;
+}
+
+static inline void
+get_layout_state(struct nfs4_layout_state *ls)
+{
+ kref_get(&ls->ls_ref);
+}
+
+static void
+destroy_layout_state_common(struct nfs4_layout_state *ls)
+{
+ struct nfs4_file *fp = ls->ls_file;
+
+ dprintk("pNFS %s: ls %p fp %p clp %p\n", __func__, ls, fp,
+ ls->ls_client);
+ BUG_ON(!list_empty(&ls->ls_layouts));
+ kfree(ls);
+ put_nfs4_file(fp);
+}
+
+static void
+destroy_layout_state(struct kref *kref)
+{
+ struct nfs4_layout_state *ls =
+ container_of(kref, struct nfs4_layout_state, ls_ref);
+
+ spin_lock(&layout_lock);
+ list_del(&ls->ls_perfile);
+ spin_unlock(&layout_lock);
+ destroy_layout_state_common(ls);
+}
+
+static void
+destroy_layout_state_locked(struct kref *kref)
+{
+ struct nfs4_layout_state *ls =
+ container_of(kref, struct nfs4_layout_state, ls_ref);
+
+ list_del(&ls->ls_perfile);
+ destroy_layout_state_common(ls);
+}
+
+static inline void
+put_layout_state(struct nfs4_layout_state *ls)
+{
+ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
+ atomic_read(&ls->ls_ref.refcount));
+ kref_put(&ls->ls_ref, destroy_layout_state);
+}
+
+static inline void
+put_layout_state_locked(struct nfs4_layout_state *ls)
+{
+ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
+ atomic_read(&ls->ls_ref.refcount));
+ kref_put(&ls->ls_ref, destroy_layout_state_locked);
+}
+
+/*
+ * Search the fp->fi_layout_states list for a layout state with the clientid.
+ * If not found, then this is a 'first open/delegation/lock stateid' from
+ * the client for this file.
+ * Called under the layout_lock.
+ */
+static struct nfs4_layout_state *
+find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+ struct nfs4_layout_state *ls;
+
+ BUG_ON_UNLOCKED_LAYOUT();
+ list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) {
+ if (ls->ls_client == clp) {
+ dprintk("pNFS %s: before GET ls %p ls_ref %d\n",
+ __func__, ls,
+ atomic_read(&ls->ls_ref.refcount));
+ get_layout_state(ls);
+ return ls;
+ }
+ }
+ return NULL;
+}
+
+static __be32
+verify_stateid(struct nfs4_file *fp, stateid_t *stateid)
+{
+ struct nfs4_stateid *local = NULL;
+ struct nfs4_delegation *temp = NULL;
+
+ /* check if open or lock stateid */
+ local = find_stateid(stateid, RD_STATE);
+ if (local)
+ return 0;
+ temp = find_delegation_stateid(fp->fi_inode, stateid);
+ if (temp)
+ return 0;
+ return nfserr_bad_stateid;
+}
+
+/*
+ * nfs4_process_layout_stateid()
+ *
+ * We have looked up the nfs4_file corresponding to the current_fh, and
+ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op()
+ * that make sense with a layout stateid.
+ *
+ * Called with the state_lock held
+ * Returns zero and stateid is updated, or error.
+ *
+ * Note: the struct nfs4_layout_state pointer is only set by layoutget.
+ */
+static __be32
+nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp,
+ stateid_t *stateid, struct nfs4_layout_state **lsp)
+{
+ struct nfs4_layout_state *ls = NULL;
+ __be32 status = 0;
+
+ dprintk("--> %s clp %p fp %p \n", __func__, clp, fp);
+
+ dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__,
+ STATEID_VAL(stateid));
+
+ status = nfs4_check_stateid(stateid);
+ if (status)
+ goto out;
+
+ /* Is this the first use of this layout ? */
+ spin_lock(&layout_lock);
+ ls = find_get_layout_state(clp, fp);
+ spin_unlock(&layout_lock);
+ if (!ls) {
+ /* Only alloc layout state on layoutget (which sets lsp). */
+ if (!lsp) {
+ dprintk("%s ERROR: Not layoutget & no layout stateid\n",
+ __func__);
+ status = nfserr_bad_stateid;
+ goto out;
+ }
+ dprintk("%s Initial stateid for layout: file %p client %p\n",
+ __func__, fp, clp);
+
+ /* verify input stateid */
+ status = verify_stateid(fp, stateid);
+ if (status) {
+ dprintk("%s ERROR: invalid open/deleg/lock stateid\n",
+ __func__);
+ goto out;
+ }
+ ls = alloc_init_layout_state(clp, fp, stateid);
+ if (!ls) {
+ dprintk("%s pNFS ERROR: no memory for layout state\n",
+ __func__);
+ status = nfserr_resource;
+ goto out;
+ }
+ } else {
+ dprintk("%s Not initial stateid. Layout state %p file %p\n",
+ __func__, ls, fp);
+
+ /* BAD STATEID */
+ status = nfserr_bad_stateid;
+ if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque,
+ sizeof(stateid_opaque_t)) != 0) {
+
+ /* if a LAYOUTGET operation and stateid is a valid
+ * open/deleg/lock stateid, accept it as a parallel
+ * initial layout stateid
+ */
+ if (lsp && ((verify_stateid(fp, stateid)) == 0)) {
+ dprintk("%s parallel initial layout state\n",
+ __func__);
+ goto verified;
+ }
+
+ dprintk("%s ERROR bad opaque in stateid 1\n", __func__);
+ goto out_put;
+ }
+
+ /* stateid is a valid layout stateid for this file. */
+ if (stateid->si_generation > ls->ls_stateid.si_generation) {
+ dprintk("%s bad stateid 1\n", __func__);
+ goto out_put;
+ }
+ }
+verified:
+ status = 0;
+
+ /* Return the layout state if requested */
+ if (lsp) {
+ get_layout_state(ls);
+ *lsp = ls;
+ }
+ dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__,
+ STATEID_VAL(&ls->ls_stateid));
+out_put:
+ dprintk("%s PUT LO STATE:\n", __func__);
+ put_layout_state(ls);
+out:
+ dprintk("<-- %s status %d\n", __func__, htonl(status));
+
+ return status;
+}
+
+static inline struct nfs4_layout *
+alloc_layout(void)
+{
+ return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL);
+}
+
+static inline void
+free_layout(struct nfs4_layout *lp)
+{
+ kmem_cache_free(pnfs_layout_slab, lp);
+}
+
+#define update_layout_stateid(ls, sid) { \
+ update_stateid(&(ls)->ls_stateid); \
+ dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", \
+ __func__, (ls)->ls_stateid.si_generation, (ls)); \
+ memcpy((sid), &(ls)->ls_stateid, sizeof(stateid_t)); \
+}
+
+static void
+init_layout(struct nfs4_layout_state *ls,
+ struct nfs4_layout *lp,
+ struct nfs4_file *fp,
+ struct nfs4_client *clp,
+ struct svc_fh *current_fh,
+ struct nfsd4_layout_seg *seg,
+ stateid_t *stateid)
+{
+ dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__,
+ ls, lp, clp, fp, fp->fi_inode);
+
+ get_nfs4_file(fp);
+ lp->lo_client = clp;
+ lp->lo_file = fp;
+ get_layout_state(ls);
+ lp->lo_state = ls;
+ memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg));
+ spin_lock(&layout_lock);
+ update_layout_stateid(ls, stateid);
+ list_add_tail(&lp->lo_perstate, &ls->ls_layouts);
+ list_add_tail(&lp->lo_perclnt, &clp->cl_layouts);
+ list_add_tail(&lp->lo_perfile, &fp->fi_layouts);
+ spin_unlock(&layout_lock);
+ dprintk("pNFS %s end\n", __func__);
+}
+
+static void
+dequeue_layout(struct nfs4_layout *lp)
+{
+ BUG_ON_UNLOCKED_LAYOUT();
+ list_del(&lp->lo_perclnt);
+ list_del(&lp->lo_perfile);
+ list_del(&lp->lo_perstate);
+}
+
+static void
+destroy_layout(struct nfs4_layout *lp)
+{
+ struct nfs4_client *clp;
+ struct nfs4_file *fp;
+ struct nfs4_layout_state *ls;
+
+ BUG_ON_UNLOCKED_LAYOUT();
+ clp = lp->lo_client;
+ fp = lp->lo_file;
+ ls = lp->lo_state;
+ dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n",
+ __func__, lp, clp, fp, fp->fi_inode,
+ list_empty(&ls->ls_layouts));
+
+ kmem_cache_free(pnfs_layout_slab, lp);
+ /* release references taken by init_layout */
+ put_layout_state_locked(ls);
+ put_nfs4_file(fp);
+}
+
+void fs_layout_return(struct super_block *sb, struct inode *ino,
+ struct nfsd4_pnfs_layoutreturn *lrp, int flags,
+ void *recall_cookie)
+{
+ int ret;
+
+ if (unlikely(!sb->s_pnfs_op->layout_return))
+ return;
+
+ lrp->lr_flags = flags;
+ lrp->args.lr_cookie = recall_cookie;
+
+ if (!ino) /* FSID or ALL */
+ ino = sb->s_root->d_inode;
+
+ ret = sb->s_pnfs_op->layout_return(ino, &lrp->args);
+ dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx "
+ "cookie = %p flags 0x%x status=%d\n",
+ __func__, ino->i_ino, lrp->args.lr_seg.iomode,
+ lrp->args.lr_seg.offset, lrp->args.lr_seg.length,
+ recall_cookie, flags, ret);
+}
+
+static u64
+alloc_init_sbid(struct super_block *sb)
+{
+ struct sbid_tracker *sbid;
+ struct sbid_tracker *new = alloc_sbid();
+ unsigned long hash_idx = sbid_hashval(sb);
+ u64 id = 0;
+
+ if (likely(new)) {
+ spin_lock(&layout_lock);
+ id = ++current_sbid;
+ new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK);
+ id = new->id;
+ BUG_ON(id == 0);
+ new->sb = sb;
+
+ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash)
+ if (sbid->sb == sb) {
+ kfree(new);
+ id = sbid->id;
+ spin_unlock(&layout_lock);
+ return id;
+ }
+ list_add(&new->hash, &sbid_hashtbl[hash_idx]);
+ spin_unlock(&layout_lock);
+ }
+ return id;
+}
+
+struct super_block *
+find_sbid_id(u64 id)
+{
+ struct sbid_tracker *sbid;
+ struct super_block *sb = NULL;
+ unsigned long hash_idx = id & SBID_HASH_MASK;
+ int pos = 0;
+
+ spin_lock(&layout_lock);
+ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) {
+ pos++;
+ if (sbid->id != id)
+ continue;
+ if (pos > 1)
+ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]);
+ sb = sbid->sb;
+ break;
+ }
+ spin_unlock(&layout_lock);
+ return sb;
+}
+
+u64
+find_create_sbid(struct super_block *sb)
+{
+ struct sbid_tracker *sbid;
+ unsigned long hash_idx = sbid_hashval(sb);
+ int pos = 0;
+ u64 id = 0;
+
+ spin_lock(&layout_lock);
+ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) {
+ pos++;
+ if (sbid->sb != sb)
+ continue;
+ if (pos > 1)
+ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]);
+ id = sbid->id;
+ break;
+ }
+ spin_unlock(&layout_lock);
+
+ if (!id)
+ id = alloc_init_sbid(sb);
+
+ return id;
+}
+
+/*
+ * Create a layoutrecall structure
+ * An optional layoutrecall can be cloned (except for the layoutrecall lists)
+ */
+static struct nfs4_layoutrecall *
+alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl,
+ struct nfs4_client *clp,
+ struct nfs4_file *lrfile)
+{
+ struct nfs4_layoutrecall *clr;
+
+ dprintk("NFSD %s\n", __func__);
+ clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL);
+ if (clr == NULL)
+ return clr;
+
+ dprintk("NFSD %s -->\n", __func__);
+
+ memset(clr, 0, sizeof(*clr));
+ if (lrfile)
+ get_nfs4_file(lrfile);
+ clr->clr_client = clp;
+ clr->clr_file = lrfile;
+ clr->cb = *cbl;
+
+ kref_init(&clr->clr_ref);
+ INIT_LIST_HEAD(&clr->clr_perclnt);
+ INIT_WORK(&clr->clr_recall.cb_work, nfsd4_do_callback_rpc);
+
+ dprintk("NFSD %s return %p\n", __func__, clr);
+ return clr;
+}
+
+static void
+get_layoutrecall(struct nfs4_layoutrecall *clr)
+{
+ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
+ atomic_read(&clr->clr_ref.refcount));
+ kref_get(&clr->clr_ref);
+}
+
+static void
+destroy_layoutrecall(struct kref *kref)
+{
+ struct nfs4_layoutrecall *clr =
+ container_of(kref, struct nfs4_layoutrecall, clr_ref);
+ dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr,
+ clr->clr_file, clr->clr_client);
+ BUG_ON(!list_empty(&clr->clr_perclnt));
+ if (clr->clr_file)
+ put_nfs4_file(clr->clr_file);
+ kmem_cache_free(pnfs_layoutrecall_slab, clr);
+}
+
+int
+put_layoutrecall(struct nfs4_layoutrecall *clr)
+{
+ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
+ atomic_read(&clr->clr_ref.refcount));
+ return kref_put(&clr->clr_ref, destroy_layoutrecall);
+}
+
+void *
+layoutrecall_done(struct nfs4_layoutrecall *clr)
+{
+ void *recall_cookie = clr->cb.cbl_cookie;
+ struct nfs4_layoutrecall *parent = clr->parent;
+
+ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
+ atomic_read(&clr->clr_ref.refcount));
+ BUG_ON_UNLOCKED_LAYOUT();
+ list_del_init(&clr->clr_perclnt);
+ put_layoutrecall(clr);
+
+ if (parent && !put_layoutrecall(parent))
+ recall_cookie = NULL;
+
+ return recall_cookie;
+}
+
+/*
+ * get_state() / cb_get_state() support: per-stateid data-server device list.
+ */
+void
+release_pnfs_ds_dev_list(struct nfs4_stateid *stp)
+{
+ struct pnfs_ds_dev_entry *ddp;
+
+ while (!list_empty(&stp->st_pnfs_ds_id)) {
+ ddp = list_entry(stp->st_pnfs_ds_id.next,
+ struct pnfs_ds_dev_entry, dd_dev_entry);
+ list_del(&ddp->dd_dev_entry);
+ kfree(ddp);
+ }
+}
+
+static int
+nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid)
+{
+ struct pnfs_ds_dev_entry *ddp;
+
+ ddp = kmalloc(sizeof(*ddp), GFP_KERNEL);
+ if (!ddp)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&ddp->dd_dev_entry);
+ list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id);
+ ddp->dd_dsid = dsid;
+ return 0;
+}
+
+/*
+ * are two octet ranges overlapping?
+ * start1 last1
+ * |-----------------|
+ * start2 last2
+ * |----------------|
+ */
+static inline int
+lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
+{
+ u64 start1 = l1->offset;
+ u64 last1 = last_byte_offset(start1, l1->length);
+ u64 start2 = l2->offset;
+ u64 last2 = last_byte_offset(start2, l2->length);
+ int ret;
+
+ /* if last1 == start2 there's a single byte overlap */
+ ret = (last2 >= start1) && (last1 >= start2);
+ dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__,
+ l1->offset, l1->length, l2->offset, l2->length, ret);
+ return ret;
+}
+
+static inline int
+same_fsid_major(struct nfs4_fsid *fsid, u64 major)
+{
+ return fsid->major == major;
+}
+
+static inline int
+same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh)
+{
+ return same_fsid_major(fsid, current_fh->fh_export->ex_fsid);
+}
+
+/*
+ * find a layout recall conflicting with the specified layoutget
+ */
+static int
+is_layout_recalled(struct nfs4_client *clp,
+ struct svc_fh *current_fh,
+ struct nfsd4_layout_seg *seg)
+{
+ struct nfs4_layoutrecall *clr;
+
+ spin_lock(&layout_lock);
+ list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) {
+ if (clr->cb.cbl_seg.layout_type != seg->layout_type)
+ continue;
+ if (clr->cb.cbl_recall_type == RETURN_ALL)
+ goto found;
+ if (clr->cb.cbl_recall_type == RETURN_FSID) {
+ if (same_fsid(&clr->cb.cbl_fsid, current_fh))
+ goto found;
+ else
+ continue;
+ }
+ BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE);
+ if (clr->cb.cbl_seg.clientid == seg->clientid &&
+ lo_seg_overlapping(&clr->cb.cbl_seg, seg))
+ goto found;
+ }
+ spin_unlock(&layout_lock);
+ return 0;
+found:
+ spin_unlock(&layout_lock);
+ return 1;
+}
+
+/*
+ * are two octet ranges overlapping or adjacent?
+ */
+static inline int
+lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
+{
+ u64 start1 = l1->offset;
+ u64 end1 = end_offset(start1, l1->length);
+ u64 start2 = l2->offset;
+ u64 end2 = end_offset(start2, l2->length);
+
+	/* if end1 == start2 the ranges are adjacent */
+ return (end2 >= start1) && (end1 >= start2);
+}
+
+static void
+extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg)
+{
+ u64 lo_start = lo->offset;
+ u64 lo_end = end_offset(lo_start, lo->length);
+ u64 lg_start = lg->offset;
+ u64 lg_end = end_offset(lg_start, lg->length);
+
+ /* lo already covers lg? */
+ if (lo_start <= lg_start && lg_end <= lo_end)
+ return;
+
+ /* extend start offset */
+ if (lo_start > lg_start)
+ lo_start = lg_start;
+
+ /* extend end offset */
+ if (lo_end < lg_end)
+ lo_end = lg_end;
+
+ lo->offset = lo_start;
+ lo->length = (lo_end == NFS4_MAX_UINT64) ?
+ lo_end : lo_end - lo_start;
+}
+
+static struct nfs4_layout *
+merge_layout(struct nfs4_file *fp,
+ struct nfs4_client *clp,
+ struct nfsd4_layout_seg *seg)
+{
+ struct nfs4_layout *lp = NULL;
+
+ spin_lock(&layout_lock);
+ list_for_each_entry (lp, &fp->fi_layouts, lo_perfile)
+ if (lp->lo_seg.layout_type == seg->layout_type &&
+ lp->lo_seg.clientid == seg->clientid &&
+ lp->lo_seg.iomode == seg->iomode &&
+ lo_seg_mergeable(&lp->lo_seg, seg)) {
+ extend_layout(&lp->lo_seg, seg);
+ break;
+ }
+ spin_unlock(&layout_lock);
+
+ return lp;
+}
+
+__be32
+nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp,
+		     struct exp_xdr_stream *xdr)
+{
+	u32 status;
+	__be32 nfserr;
+	struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode;
+	struct super_block *sb = ino->i_sb;
+	int can_merge;
+	struct nfs4_file *fp;
+	struct nfs4_client *clp;
+	struct nfs4_layout *lp = NULL;
+	struct nfs4_layout_state *ls = NULL;
+	struct nfsd4_pnfs_layoutget_arg args = {
+		.lg_minlength = lgp->lg_minlength,
+		.lg_fh = &lgp->lg_fhp->fh_handle,
+	};
+	struct nfsd4_pnfs_layoutget_res res = {
+		.lg_seg = lgp->lg_seg,
+	};
+
+	dprintk("NFSD: %s Begin\n", __func__);
+
+	args.lg_sbid = find_create_sbid(sb);
+	if (!args.lg_sbid) {
+		nfserr = nfserr_layouttrylater;
+		goto out;
+	}
+
+	can_merge = sb->s_pnfs_op->can_merge_layouts != NULL &&
+		sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type);
+
+	nfs4_lock_state();
+	fp = find_alloc_file(ino, lgp->lg_fhp);
+	clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid);
+	dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp);
+	if (!fp || !clp) {
+		nfserr = nfserr_inval;
+		goto out_unlock;
+	}
+
+	/* Check decoded layout stateid */
+	nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls);
+	if (nfserr)
+		goto out_unlock;
+
+	if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) {
+		nfserr = nfserr_recallconflict;
+		goto out_unlock; /* was "goto out": leaked fp/ls refs and left state lock held */
+	}
+
+	/* pre-alloc layout in case we can't merge after we call
+	 * the file system
+	 */
+	lp = alloc_layout();
+	if (!lp) {
+		nfserr = nfserr_layouttrylater;
+		goto out_unlock;
+	}
+
+	dprintk("pNFS %s: pre-export type 0x%x maxcount %zd "
+		"iomode %u offset %llu length %llu\n",
+		__func__, lgp->lg_seg.layout_type,
+		exp_xdr_qbytes(xdr->end - xdr->p),
+		lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length);
+
+	/* FIXME: need to eliminate the use of the state lock */
+	nfs4_unlock_state();
+	status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res);
+	nfs4_lock_state();
+
+	dprintk("pNFS %s: post-export status %u "
+		"iomode %u offset %llu length %llu\n",
+		__func__, status, res.lg_seg.iomode,
+		res.lg_seg.offset, res.lg_seg.length);
+
+	/*
+	 * The allowable error codes for the layout_get pNFS export
+	 * operations vector function (from the file system) can be
+	 * expanded as needed to include other errors defined for
+	 * the RFC 5661 LAYOUTGET operation.
+	 */
+	switch (status) {
+	case 0:
+		nfserr = NFS4_OK;
+		break;
+	case NFS4ERR_ACCESS:
+	case NFS4ERR_BADIOMODE:
+		/* No support for LAYOUTIOMODE4_RW layouts */
+	case NFS4ERR_BADLAYOUT:
+		/* No layout matching loga_minlength rules */
+	case NFS4ERR_INVAL:
+	case NFS4ERR_IO:
+	case NFS4ERR_LAYOUTTRYLATER:
+	case NFS4ERR_LAYOUTUNAVAILABLE:
+	case NFS4ERR_LOCKED:
+	case NFS4ERR_NOSPC:
+	case NFS4ERR_RECALLCONFLICT:
+	case NFS4ERR_SERVERFAULT:
+	case NFS4ERR_TOOSMALL:
+		/* Requested layout too big for loga_maxcount */
+	case NFS4ERR_WRONG_TYPE:
+		/* Not a regular file */
+		nfserr = cpu_to_be32(status);
+		goto out_freelayout;
+	default:
+		BUG();
+		nfserr = nfserr_serverfault;
+	}
+
+	lgp->lg_seg = res.lg_seg;
+	lgp->lg_roc = res.lg_return_on_close;
+
+	/* SUCCESS!
+	 * Can the new layout be merged into an existing one?
+	 * If so, free unused layout struct
+	 */
+	if (can_merge && merge_layout(fp, clp, &res.lg_seg))
+		goto out_freelayout;
+
+	/* Can't merge, so let's initialize this new layout */
+	init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg, &lgp->lg_sid);
+out_unlock:
+	if (ls)
+		put_layout_state(ls);
+	if (fp)
+		put_nfs4_file(fp);
+	nfs4_unlock_state();
+out:
+	dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp,
+		be32_to_cpu(nfserr));
+	return nfserr;
+out_freelayout:
+	free_layout(lp);
+	goto out_unlock;
+}
+
+static void
+trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr)
+{
+ u64 lo_start = lo->offset;
+ u64 lo_end = end_offset(lo_start, lo->length);
+ u64 lr_start = lr->offset;
+ u64 lr_end = end_offset(lr_start, lr->length);
+
+ dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__,
+ lo->offset, lo->length, lr->offset, lr->length);
+
+ /* lr fully covers lo? */
+ if (lr_start <= lo_start && lo_end <= lr_end) {
+ lo->length = 0;
+ goto out;
+ }
+
+ /*
+ * split not supported yet. retain layout segment.
+ * remains must be returned by the client
+ * on the final layout return.
+ */
+ if (lo_start < lr_start && lr_end < lo_end) {
+ dprintk("%s: split not supported\n", __func__);
+ goto out;
+ }
+
+ if (lo_start < lr_start)
+ lo_end = lr_start - 1;
+ else /* lr_end < lo_end */
+ lo_start = lr_end + 1;
+
+ lo->offset = lo_start;
+ lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start;
+out:
+ dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length);
+}
+
+static int
+pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp,
+ struct nfsd4_pnfs_layoutreturn *lrp,
+ struct nfs4_layout_state *ls)
+{
+ int layouts_found = 0;
+ struct nfs4_layout *lp, *nextlp;
+
+ dprintk("%s: clp %p fp %p\n", __func__, clp, fp);
+ spin_lock(&layout_lock);
+ list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) {
+ dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n",
+ __func__, lp,
+ lp->lo_client, clp,
+ lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type,
+ lp->lo_seg.iomode, lrp->args.lr_seg.iomode);
+ if (lp->lo_client != clp ||
+ lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type ||
+ (lp->lo_seg.iomode != lrp->args.lr_seg.iomode &&
+ lrp->args.lr_seg.iomode != IOMODE_ANY) ||
+ !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg))
+ continue;
+ layouts_found++;
+ trim_layout(&lp->lo_seg, &lrp->args.lr_seg);
+ if (!lp->lo_seg.length) {
+ lrp->lrs_present = 0;
+ dequeue_layout(lp);
+ destroy_layout(lp);
+ }
+ }
+ if (ls && layouts_found && lrp->lrs_present)
+ update_layout_stateid(ls, &lrp->lr_sid);
+ spin_unlock(&layout_lock);
+
+ return layouts_found;
+}
+
+static int
+pnfs_return_client_layouts(struct nfs4_client *clp,
+ struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid)
+{
+ int layouts_found = 0;
+ struct nfs4_layout *lp, *nextlp;
+
+ spin_lock(&layout_lock);
+ list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) {
+ if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type ||
+ (lrp->args.lr_seg.iomode != lp->lo_seg.iomode &&
+ lrp->args.lr_seg.iomode != IOMODE_ANY))
+ continue;
+
+ if (lrp->args.lr_return_type == RETURN_FSID &&
+ !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid))
+ continue;
+
+ layouts_found++;
+ dequeue_layout(lp);
+ destroy_layout(lp);
+ }
+ spin_unlock(&layout_lock);
+
+ return layouts_found;
+}
+
+static int
+recall_return_perfect_match(struct nfs4_layoutrecall *clr,
+ struct nfsd4_pnfs_layoutreturn *lrp,
+ struct nfs4_file *fp,
+ struct svc_fh *current_fh)
+{
+ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode ||
+ clr->cb.cbl_recall_type != lrp->args.lr_return_type)
+ return 0;
+
+ return (clr->cb.cbl_recall_type == RETURN_FILE &&
+ clr->clr_file == fp &&
+ clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset &&
+ clr->cb.cbl_seg.length == lrp->args.lr_seg.length) ||
+
+ (clr->cb.cbl_recall_type == RETURN_FSID &&
+ same_fsid(&clr->cb.cbl_fsid, current_fh)) ||
+
+ clr->cb.cbl_recall_type == RETURN_ALL;
+}
+
+static int
+recall_return_partial_match(struct nfs4_layoutrecall *clr,
+ struct nfsd4_pnfs_layoutreturn *lrp,
+ struct nfs4_file *fp,
+ struct svc_fh *current_fh)
+{
+ /* iomode matching? */
+ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode &&
+ clr->cb.cbl_seg.iomode != IOMODE_ANY &&
+ lrp->args.lr_seg.iomode != IOMODE_ANY)
+ return 0;
+
+ if (clr->cb.cbl_recall_type == RETURN_ALL ||
+ lrp->args.lr_return_type == RETURN_ALL)
+ return 1;
+
+ /* fsid matches? */
+ if (clr->cb.cbl_recall_type == RETURN_FSID ||
+ lrp->args.lr_return_type == RETURN_FSID)
+ return same_fsid(&clr->cb.cbl_fsid, current_fh);
+
+ /* file matches, range overlapping? */
+ return clr->clr_file == fp &&
+ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg);
+}
+
+int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh,
+ struct nfsd4_pnfs_layoutreturn *lrp)
+{
+ int status = 0;
+ int layouts_found = 0;
+ struct inode *ino = current_fh->fh_dentry->d_inode;
+ struct nfs4_file *fp = NULL;
+ struct nfs4_client *clp;
+ struct nfs4_layout_state *ls = NULL;
+ struct nfs4_layoutrecall *clr, *nextclr;
+ u64 ex_fsid = current_fh->fh_export->ex_fsid;
+ void *recall_cookie = NULL;
+
+ dprintk("NFSD: %s\n", __func__);
+
+ nfs4_lock_state();
+ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid);
+ if (!clp)
+ goto out;
+
+ if (lrp->args.lr_return_type == RETURN_FILE) {
+ fp = find_file(ino);
+ if (!fp) {
+ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for "
+ "ino %p:%lu\n",
+ __func__, ino, ino ? ino->i_ino : 0L);
+ goto out;
+ }
+
+ /* Check the stateid */
+ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino);
+ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, &ls);
+ if (status)
+ goto out_put_file;
+
+ /* update layouts */
+ layouts_found = pnfs_return_file_layouts(clp, fp, lrp, ls);
+ /* optimize for the all-empty case */
+ if (list_empty(&fp->fi_layouts))
+ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
+ } else {
+ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid);
+ }
+
+ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d "
+ "return_type %d fsid 0x%llx offset %llu length %llu: "
+ "layouts_found %d\n",
+ __func__, clp, fp, lrp->args.lr_seg.layout_type,
+ lrp->args.lr_seg.iomode, lrp->args.lr_return_type,
+ ex_fsid,
+ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found);
+
+ /* update layoutrecalls
+ * note: for RETURN_{FSID,ALL}, fp may be NULL
+ */
+ spin_lock(&layout_lock);
+ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls,
+ clr_perclnt) {
+ if (clr->cb.cbl_seg.layout_type != lrp->args.lr_seg.layout_type)
+ continue;
+
+ if (recall_return_perfect_match(clr, lrp, fp, current_fh))
+ recall_cookie = layoutrecall_done(clr);
+ else if (layouts_found &&
+ recall_return_partial_match(clr, lrp, fp, current_fh))
+ clr->clr_time = CURRENT_TIME;
+ }
+ spin_unlock(&layout_lock);
+
+out_put_file:
+ if (fp)
+ put_nfs4_file(fp);
+ if (ls)
+ put_layout_state(ls);
+out:
+ nfs4_unlock_state();
+
+ /* call exported filesystem layout_return (ignore return-code) */
+ fs_layout_return(sb, ino, lrp, 0, recall_cookie);
+
+ dprintk("pNFS %s: exit status %d \n", __func__, status);
+ return status;
+}
+
+/*
+ * PNFS Metadata server export operations callback for get_state
+ *
+ * called by the cluster fs when it receives a get_state() from a data
+ * server.
+ * returns status, or pnfs_get_state* with pnfs_get_state->status set.
+ *
+ */
+int
+nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg)
+{
+ struct nfs4_stateid *stp;
+ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */
+ int status = -EINVAL;
+ struct inode *ino;
+ struct nfs4_delegation *dl;
+ stateid_t *stid = (stateid_t *)&arg->stid;
+
+ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__,
+ STATEID_VAL(stid), arg->ino);
+
+ nfs4_lock_state();
+ stp = find_stateid(stid, flags);
+ if (!stp) {
+ ino = iget_locked(sb, arg->ino);
+ if (!ino)
+ goto out;
+
+ if (ino->i_state & I_NEW) {
+ iget_failed(ino);
+ goto out;
+ }
+
+ dl = find_delegation_stateid(ino, stid);
+ if (dl)
+ status = 0;
+
+ iput(ino);
+ } else {
+ /* XXX ANDROS: marc removed nfs4_check_fh - how come? */
+
+		/* arg->dsid is the data server id, set by the cluster fs */
+ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid);
+ if (status)
+ goto out;
+
+ arg->access = stp->st_access_bmap;
+ *(clientid_t *)&arg->clid =
+ stp->st_stateowner->so_client->cl_clientid;
+ }
+out:
+ nfs4_unlock_state();
+ return status;
+}
+
+static int
+cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile,
+ stateid_t *lsid)
+{
+ int found = 0;
+ struct nfs4_layout *lp;
+ struct nfs4_layout_state *ls;
+
+ spin_lock(&layout_lock);
+ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) {
+ if (lp->lo_file != lrfile)
+ continue;
+
+ ls = find_get_layout_state(clp, lrfile);
+ if (!ls) {
+ /* This shouldn't happen as the file should have a
+ * layout stateid if it has a layout.
+ */
+ printk(KERN_ERR "%s: file %p has no layout stateid\n",
+ __func__, lrfile);
+ WARN_ON(1);
+ break;
+ }
+ update_layout_stateid(ls, lsid);
+ put_layout_state_locked(ls);
+ found = 1;
+ break;
+ }
+ spin_unlock(&layout_lock);
+
+ return found;
+}
+
+static int
+cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid)
+{
+ int found = 0;
+ struct nfs4_layout *lp;
+
+ /* note: minor version unused */
+ spin_lock(&layout_lock);
+ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt)
+ if (lp->lo_file->fi_fsid.major == fsid->major) {
+ found = 1;
+ break;
+ }
+ spin_unlock(&layout_lock);
+ return found;
+}
+
+static int
+cl_has_any_layout(struct nfs4_client *clp)
+{
+ return !list_empty(&clp->cl_layouts);
+}
+
+static int
+cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl,
+ struct nfs4_file *lrfile, stateid_t *lsid)
+{
+ switch (cbl->cbl_recall_type) {
+ case RETURN_FILE:
+ return cl_has_file_layout(clp, lrfile, lsid);
+ case RETURN_FSID:
+ return cl_has_fsid_layout(clp, &cbl->cbl_fsid);
+ default:
+ return cl_has_any_layout(clp);
+ }
+}
+
+/*
+ * Called without the layout_lock.
+ */
+void
+nomatching_layout(struct nfs4_layoutrecall *clr)
+{
+ struct nfsd4_pnfs_layoutreturn lr = {
+ .args.lr_return_type = clr->cb.cbl_recall_type,
+ .args.lr_seg = clr->cb.cbl_seg,
+ };
+ struct inode *inode;
+ void *recall_cookie;
+
+ if (clr->clr_file) {
+ inode = igrab(clr->clr_file->fi_inode);
+ if (WARN_ON(!inode))
+ return;
+ } else {
+ inode = NULL;
+ }
+
+ dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__,
+ clr->clr_client, clr->clr_file);
+
+ if (clr->cb.cbl_recall_type == RETURN_FILE)
+ pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr,
+ NULL);
+ else
+ pnfs_return_client_layouts(clr->clr_client, &lr,
+ clr->cb.cbl_fsid.major);
+
+ spin_lock(&layout_lock);
+ recall_cookie = layoutrecall_done(clr);
+ spin_unlock(&layout_lock);
+
+ fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN,
+ recall_cookie);
+ iput(inode);
+}
+
+void pnfs_expire_client(struct nfs4_client *clp)
+{
+ for (;;) {
+ struct nfs4_layoutrecall *lrp = NULL;
+
+ spin_lock(&layout_lock);
+ if (!list_empty(&clp->cl_layoutrecalls)) {
+ lrp = list_entry(clp->cl_layoutrecalls.next,
+ struct nfs4_layoutrecall, clr_perclnt);
+ get_layoutrecall(lrp);
+ }
+ spin_unlock(&layout_lock);
+ if (!lrp)
+ break;
+
+ dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file);
+ BUG_ON(lrp->clr_client != clp);
+ nomatching_layout(lrp);
+ put_layoutrecall(lrp);
+ }
+
+ for (;;) {
+ struct nfs4_layout *lp = NULL;
+ struct inode *inode = NULL;
+ struct nfsd4_pnfs_layoutreturn lr;
+ bool empty = false;
+
+ spin_lock(&layout_lock);
+ if (!list_empty(&clp->cl_layouts)) {
+ lp = list_entry(clp->cl_layouts.next,
+ struct nfs4_layout, lo_perclnt);
+ inode = igrab(lp->lo_file->fi_inode);
+ memset(&lr, 0, sizeof(lr));
+ lr.args.lr_return_type = RETURN_FILE;
+ lr.args.lr_seg = lp->lo_seg;
+ empty = list_empty(&lp->lo_file->fi_layouts);
+ BUG_ON(lp->lo_client != clp);
+ dequeue_layout(lp);
+ destroy_layout(lp); /* do not access lp after this */
+ }
+ spin_unlock(&layout_lock);
+ if (!lp)
+ break;
+
+ if (WARN_ON(!inode))
+ break;
+
+ dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino,
+ lp, clp);
+
+ fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE,
+ empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL);
+ iput(inode);
+ }
+}
+
+struct create_recall_list_arg {
+ struct nfsd4_pnfs_cb_layout *cbl;
+ struct nfs4_file *lrfile;
+ struct list_head *todolist;
+ unsigned todo_count;
+};
+
+/*
+ * look for matching layout for the given client
+ * and add a pending layout recall to the todo list
+ * if found any.
+ * returns:
+ * 0 if layouts found or negative error.
+ */
+static int
+lo_recall_per_client(struct nfs4_client *clp, void *p)
+{
+ stateid_t lsid;
+ struct nfs4_layoutrecall *pending;
+ struct create_recall_list_arg *arg = p;
+
+ memset(&lsid, 0, sizeof(lsid));
+ if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid))
+ return 0;
+
+ /* Matching put done by layoutreturn */
+ pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile);
+ /* out of memory, drain todo queue */
+ if (!pending)
+ return -ENOMEM;
+
+ *(stateid_t *)&pending->cb.cbl_sid = lsid;
+ list_add(&pending->clr_perclnt, arg->todolist);
+ arg->todo_count++;
+ return 0;
+}
+
+/* Create a layoutrecall structure for each client based on the
+ * original structure. */
+int
+create_layout_recall_list(struct list_head *todolist, unsigned *todo_len,
+ struct nfsd4_pnfs_cb_layout *cbl,
+ struct nfs4_file *lrfile)
+{
+ struct nfs4_client *clp;
+ struct create_recall_list_arg arg = {
+ .cbl = cbl,
+ .lrfile = lrfile,
+ .todolist = todolist,
+ };
+ int status = 0;
+
+ dprintk("%s: -->\n", __func__);
+
+ /* If client given by fs, just do single client */
+ if (cbl->cbl_seg.clientid) {
+ clp = find_confirmed_client(
+ (clientid_t *)&cbl->cbl_seg.clientid);
+ if (!clp) {
+ status = -ENOENT;
+ dprintk("%s: clientid %llx not found\n", __func__,
+ (unsigned long long)cbl->cbl_seg.clientid);
+ goto out;
+ }
+
+ status = lo_recall_per_client(clp, &arg);
+ } else {
+ /* Check all clients for layout matches */
+ status = filter_confirmed_clients(lo_recall_per_client, &arg);
+ }
+
+out:
+ *todo_len = arg.todo_count;
+ dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status);
+ return status;
+}
+
+/*
+ * Recall layouts asynchronously
+ * Called with state lock.
+ */
+static int
+spawn_layout_recall(struct super_block *sb, struct list_head *todolist,
+ unsigned todo_len)
+{
+ struct nfs4_layoutrecall *pending;
+ struct nfs4_layoutrecall *parent = NULL;
+ int status = 0;
+
+ dprintk("%s: -->\n", __func__);
+
+ if (todo_len > 1) {
+ pending = list_entry(todolist->next, struct nfs4_layoutrecall,
+ clr_perclnt);
+
+ parent = alloc_init_layoutrecall(&pending->cb, NULL,
+ pending->clr_file);
+ if (unlikely(!parent)) {
+ /* We want forward progress. If parent cannot be
+ * allocated take the first one as parent but don't
+			 * execute it. Caller must check for -EAGAIN; if so,
+			 * when the partial recalls return,
+ * nfsd_layout_recall_cb should be called again.
+ */
+ list_del_init(&pending->clr_perclnt);
+ if (todo_len > 2) {
+ parent = pending;
+ } else {
+ parent = NULL;
+ put_layoutrecall(pending);
+ }
+ --todo_len;
+ status = -ENOMEM;
+ }
+ }
+
+ while (!list_empty(todolist)) {
+ pending = list_entry(todolist->next, struct nfs4_layoutrecall,
+ clr_perclnt);
+ list_del_init(&pending->clr_perclnt);
+ dprintk("%s: clp %p cb_client %p fp %p\n", __func__,
+ pending->clr_client,
+ pending->clr_client->cl_cb_client,
+ pending->clr_file);
+ if (unlikely(!pending->clr_client->cl_cb_client)) {
+ printk(KERN_INFO
+ "%s: clientid %08x/%08x has no callback path\n",
+ __func__,
+ pending->clr_client->cl_clientid.cl_boot,
+ pending->clr_client->cl_clientid.cl_id);
+ put_layoutrecall(pending);
+ continue;
+ }
+
+ pending->clr_time = CURRENT_TIME;
+ pending->clr_sb = sb;
+ if (parent) {
+ /* If we created a parent its initial ref count is 1.
+ * We will need to de-ref it eventually. So we just
+ * don't increment on behalf of the last one.
+ */
+ if (todo_len != 1)
+ get_layoutrecall(parent);
+ }
+ pending->parent = parent;
+ get_layoutrecall(pending);
+ /* Add to list so corresponding layoutreturn can find req */
+ list_add(&pending->clr_perclnt,
+ &pending->clr_client->cl_layoutrecalls);
+
+ nfsd4_cb_layout(pending);
+ --todo_len;
+ }
+
+ return status;
+}
+
+/*
+ * Spawn a thread to perform a recall layout
+ *
+ */
+int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode,
+ struct nfsd4_pnfs_cb_layout *cbl)
+{
+ int status;
+ struct nfs4_file *lrfile = NULL;
+ struct list_head todolist;
+ unsigned todo_len = 0;
+
+ dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl);
+ BUG_ON(!cbl);
+ BUG_ON(cbl->cbl_recall_type != RETURN_FILE &&
+ cbl->cbl_recall_type != RETURN_FSID &&
+ cbl->cbl_recall_type != RETURN_ALL);
+ BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode);
+ BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ &&
+ cbl->cbl_seg.iomode != IOMODE_RW &&
+ cbl->cbl_seg.iomode != IOMODE_ANY);
+
+ if (nfsd_serv == NULL) {
+ dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n");
+ return -ENOENT;
+ }
+
+ nfs4_lock_state();
+ status = -ENOENT;
+ if (inode) {
+ lrfile = find_file(inode);
+ if (!lrfile) {
+ dprintk("NFSD nfsd_layout_recall_cb: "
+ "nfs4_file not found\n");
+ goto err;
+ }
+ if (cbl->cbl_recall_type == RETURN_FSID)
+ cbl->cbl_fsid = lrfile->fi_fsid;
+ }
+
+ INIT_LIST_HEAD(&todolist);
+
+ /* If no cookie provided by FS, return a default one */
+ if (!cbl->cbl_cookie)
+ cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
+
+ status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile);
+ if (list_empty(&todolist)) {
+ status = -ENOENT;
+ } else {
+ /* process todolist even if create_layout_recall_list
+ * returned an error */
+ int status2 = spawn_layout_recall(sb, &todolist, todo_len);
+ if (status2)
+ status = status2;
+ }
+
+err:
+ nfs4_unlock_state();
+ if (lrfile)
+ put_nfs4_file(lrfile);
+ return (todo_len && status) ? -EAGAIN : status;
+}
+
+struct create_device_notify_list_arg {
+ struct list_head *todolist;
+ struct nfsd4_pnfs_cb_dev_list *ndl;
+};
+
+static int
+create_device_notify_per_cl(struct nfs4_client *clp, void *p)
+{
+ struct nfs4_notify_device *cbnd;
+ struct create_device_notify_list_arg *arg = p;
+
+ if (atomic_read(&clp->cl_deviceref) <= 0)
+ return 0;
+
+ cbnd = kzalloc(sizeof(*cbnd), GFP_KERNEL);
+ if (!cbnd)
+ return -ENOMEM;
+
+ cbnd->nd_list = arg->ndl;
+ cbnd->nd_client = clp;
+ INIT_WORK(&cbnd->nd_recall.cb_work, nfsd4_do_callback_rpc);
+ list_add(&cbnd->nd_perclnt, arg->todolist);
+ return 0;
+}
+
+/* Create a list of clients to send device notifications. */
+int
+create_device_notify_list(struct list_head *todolist,
+ struct nfsd4_pnfs_cb_dev_list *ndl)
+{
+ int status;
+ struct create_device_notify_list_arg arg = {
+ .todolist = todolist,
+ .ndl = ndl,
+ };
+
+ nfs4_lock_state();
+ status = filter_confirmed_clients(create_device_notify_per_cl, &arg);
+ nfs4_unlock_state();
+
+ return status;
+}
+
+/*
+ * For each client that has a device, send a device notification.
+ * XXX: Need to track which clients have which devices.
+ */
+int nfsd_device_notify_cb(struct super_block *sb,
+ struct nfsd4_pnfs_cb_dev_list *ndl)
+{
+ struct nfs4_notify_device *cbnd;
+ struct nfs4_client *nd_client;
+ unsigned int notify_num = 0;
+ int status = 0;
+ struct list_head todolist;
+
+ BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list);
+
+ dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len);
+
+ if (nfsd_serv == NULL)
+ return -ENOENT;
+
+ INIT_LIST_HEAD(&todolist);
+
+ status = create_device_notify_list(&todolist, ndl);
+
+ while (!list_empty(&todolist)) {
+ cbnd = list_entry(todolist.next, struct nfs4_notify_device,
+ nd_perclnt);
+ list_del_init(&cbnd->nd_perclnt);
+ nd_client = cbnd->nd_client;
+ nfsd4_cb_notify_device(cbnd);
+ pnfs_clear_device_notify(nd_client);
+ notify_num++;
+ }
+
+ dprintk("NFSD %s: status %d clients %u\n",
+ __func__, status, notify_num);
+ return status;
+}
diff -up linux-2.6.38.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4pnfsdlm.c
--- linux-2.6.38.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2011-03-26 07:57:44.287821200 -0400
+++ linux-2.6.38.noarch/fs/nfsd/nfs4pnfsdlm.c 2011-03-26 07:57:44.287821200 -0400
@@ -0,0 +1,461 @@
+/******************************************************************************
+ *
+ * (c) 2007 Network Appliance, Inc. All Rights Reserved.
+ * (c) 2009 NetApp. All Rights Reserved.
+ *
+ * NetApp provides this source code under the GPL v2 License.
+ * The GPL v2 license is available at
+ * http://opensource.org/licenses/gpl-license.php.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <linux/nfs4.h>
+#include <linux/nfsd/const.h>
+#include <linux/nfsd/debug.h>
+#include <linux/nfsd/nfs4pnfsdlm.h>
+#include <linux/nfsd/nfs4layoutxdr.h>
+#include <linux/sunrpc/clnt.h>
+
+#include "nfsfh.h"
+#include "nfsd.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PROC
+
+/* Just use a linked list. Do not expect more than 32 dlm_device_entries
+ * the first implementation will just use one device per cluster file system
+ */
+
+static LIST_HEAD(dlm_device_list);
+static DEFINE_SPINLOCK(dlm_device_list_lock);
+
+struct dlm_device_entry {
+ struct list_head dlm_dev_list;
+ char disk_name[DISK_NAME_LEN];
+ int num_ds;
+ char ds_list[NFSD_DLM_DS_LIST_MAX];
+};
+
+static struct dlm_device_entry *
+_nfsd4_find_pnfs_dlm_device(char *disk_name)
+{
+ struct dlm_device_entry *dlm_pdev;
+
+ dprintk("--> %s disk name %s\n", __func__, disk_name);
+ spin_lock(&dlm_device_list_lock);
+ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) {
+ dprintk("%s Look for dlm_pdev %s\n", __func__,
+ dlm_pdev->disk_name);
+ if (!memcmp(dlm_pdev->disk_name, disk_name, strlen(disk_name))) {
+ spin_unlock(&dlm_device_list_lock);
+ return dlm_pdev;
+ }
+ }
+ spin_unlock(&dlm_device_list_lock);
+ return NULL;
+}
+
+static struct dlm_device_entry *
+nfsd4_find_pnfs_dlm_device(struct super_block *sb) {
+ char dname[BDEVNAME_SIZE];
+
+ bdevname(sb->s_bdev, dname);
+ return _nfsd4_find_pnfs_dlm_device(dname);
+}
+
+ssize_t
+nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen)
+{
+ char *pos = buf;
+ ssize_t size = 0;
+ struct dlm_device_entry *dlm_pdev;
+ int ret = -EINVAL;
+
+ spin_lock(&dlm_device_list_lock);
+ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list)
+ {
+ int advanced;
+ advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list);
+ if (advanced >= buflen - size)
+ goto out;
+ size += advanced;
+ pos += advanced;
+ }
+ ret = size;
+
+out:
+ spin_unlock(&dlm_device_list_lock);
+ return ret;
+}
+
+bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds)
+{
+ char *start = ds_list;
+
+ *num_ds = 0;
+
+ while (*start) {
+ struct sockaddr_storage tempAddr;
+ int ipLen = strcspn(start, ",");
+
+ if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr)))
+ return false;
+ (*num_ds)++;
+ start += ipLen + 1;
+ }
+ return true;
+}
+
+/*
+ * pnfs_dlm_device string format:
+ * block-device-path:<ds1 ipv4 address>,<ds2 ipv4 address>
+ *
+ * Examples
+ * /dev/sda:192.168.1.96,192.168.1.97' creates a data server list with
+ * two data servers for the dlm cluster file system mounted on /dev/sda.
+ *
+ * /dev/sda:192.168.1.96,192.168.1.100'
+ * replaces the data server list for /dev/sda
+ *
+ * Only the deviceid == 1 is supported. Can add device id to
+ * pnfs_dlm_device string when needed.
+ *
+ * Only a round-robin stripe index (each data server used once) is supported.
+ */
+int
+nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len)
+
+{
+ struct dlm_device_entry *new, *found;
+ char *bufp = pnfs_dlm_device;
+ char *endp = bufp + strlen(bufp);
+ int err = -ENOMEM;
+
+ dprintk("--> %s len %d\n", __func__, len);
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return err;
+
+ err = -EINVAL;
+ /* disk_name */
+ /* FIXME: need to check for valid disk_name. search superblocks?
+ * check for slash dev slash ?
+ */
+ len = strcspn(bufp, ":");
+ if (len > DISK_NAME_LEN)
+ goto out_free;
+ memcpy(new->disk_name, bufp, len);
+
+ err = -EINVAL;
+ bufp += len + 1;
+ if (bufp >= endp)
+ goto out_free;
+
+ /* data server list */
+ /* FIXME: need to check for comma separated valid ip format */
+ len = strcspn(bufp, ":");
+ if (len > NFSD_DLM_DS_LIST_MAX)
+ goto out_free;
+ memcpy(new->ds_list, bufp, len);
+
+
+ /* validate the ips */
+ if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds)))
+ goto out_free;
+
+ dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__,
+ new->disk_name, new->num_ds, new->ds_list);
+
+ found = _nfsd4_find_pnfs_dlm_device(new->disk_name);
+ if (found) {
+ /* FIXME: should compare found->ds_list with new->ds_list
+ * and if it is different, kick off a CB_NOTIFY change
+ * deviceid.
+ */
+ dprintk("%s pnfs_dlm_device %s:%s already in cache "
+ " replace ds_list with new ds_list %s\n", __func__,
+ found->disk_name, found->ds_list, new->ds_list);
+ memset(found->ds_list, 0, DISK_NAME_LEN);
+ memcpy(found->ds_list, new->ds_list, strlen(new->ds_list));
+ found->num_ds = new->num_ds;
+ kfree(new);
+ } else {
+ dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__,
+ new->disk_name, new->ds_list);
+ spin_lock(&dlm_device_list_lock);
+ list_add(&new->dlm_dev_list, &dlm_device_list);
+ spin_unlock(&dlm_device_list_lock);
+ }
+ dprintk("<-- %s Success\n", __func__);
+ return 0;
+
+out_free:
+ kfree(new);
+ dprintk("<-- %s returns %d\n", __func__, err);
+ return err;
+}
+
+void nfsd4_pnfs_dlm_shutdown(void)
+{
+ struct dlm_device_entry *dlm_pdev, *next;
+
+ dprintk("--> %s\n", __func__);
+
+ spin_lock(&dlm_device_list_lock);
+ list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list,
+ dlm_dev_list) {
+ list_del(&dlm_pdev->dlm_dev_list);
+ kfree(dlm_pdev);
+ }
+ spin_unlock(&dlm_device_list_lock);
+}
+
+static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb,
+ u32 layout_type,
+ struct nfsd4_pnfs_dev_iter_res *res)
+{
+ if (layout_type != LAYOUT_NFSV4_1_FILES) {
+ printk(KERN_ERR "%s: ERROR: layout type isn't 'file' "
+ "(type: %x)\n", __func__, layout_type);
+ return -ENOTSUPP;
+ }
+
+ res->gd_eof = 1;
+ if (res->gd_cookie)
+ return -ENOENT;
+
+ res->gd_cookie = 1;
+ res->gd_verf = 1;
+ res->gd_devid = 1;
+ return 0;
+}
+
+static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb,
+ struct exp_xdr_stream *xdr,
+ u32 layout_type,
+ const struct nfsd4_pnfs_deviceid *devid)
+{
+ int err, len, i = 0;
+ struct pnfs_filelayout_device fdev;
+ struct pnfs_filelayout_devaddr *daddr;
+ struct dlm_device_entry *dlm_pdev;
+ char *bufp;
+
+ err = -ENOTSUPP;
+ if (layout_type != LAYOUT_NFSV4_1_FILES) {
+ dprintk("%s: ERROR: layout type isn't 'file' "
+ "(type: %x)\n", __func__, layout_type);
+ return err;
+ }
+
+ /* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO
+ * with a gdia_device_id != 1 is invalid.
+ */
+ err = -EINVAL;
+ if (devid->devid != 1) {
+ dprintk("%s: WARNING: didn't receive a deviceid of "
+ "1 (got: 0x%llx)\n", __func__, devid->devid);
+ return err;
+ }
+
+ /*
+ * If the DS list has not been established, return -EINVAL
+ */
+ dlm_pdev = nfsd4_find_pnfs_dlm_device(sb);
+ if (!dlm_pdev) {
+ dprintk("%s: DEBUG: disk %s Not Found\n", __func__,
+ sb->s_bdev->bd_disk->disk_name);
+ return err;
+ }
+
+ dprintk("%s: Found disk %s with DS list |%s|\n",
+ __func__, dlm_pdev->disk_name, dlm_pdev->ds_list);
+
+ memset(&fdev, '\0', sizeof(fdev));
+ fdev.fl_device_length = dlm_pdev->num_ds;
+
+ err = -ENOMEM;
+ len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length;
+ fdev.fl_device_list = kzalloc(len, GFP_KERNEL);
+ if (!fdev.fl_device_list) {
+ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list "
+ "buffer for %d DSes.\n", __func__, i);
+ fdev.fl_device_length = 0;
+ goto out;
+ }
+
+	/* Set a simple stripe index */
+ fdev.fl_stripeindices_length = fdev.fl_device_length;
+ fdev.fl_stripeindices_list = kzalloc(sizeof(u32) *
+ fdev.fl_stripeindices_length, GFP_KERNEL);
+
+ if (!fdev.fl_stripeindices_list) {
+ printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices "
+ "list buffer for %d DSes.\n", __func__, i);
+ goto out;
+ }
+ for (i = 0; i < fdev.fl_stripeindices_length; i++)
+ fdev.fl_stripeindices_list[i] = i;
+
+ /* Transfer the data server list with a single multipath entry */
+ bufp = dlm_pdev->ds_list;
+ for (i = 0; i < fdev.fl_device_length; i++) {
+ daddr = kmalloc(sizeof(*daddr), GFP_KERNEL);
+ if (!daddr) {
+ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
+ "addr buffer.\n", __func__);
+ goto out;
+ }
+
+ daddr->r_netid.data = "tcp";
+ daddr->r_netid.len = 3;
+
+ len = strcspn(bufp, ",");
+ daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL);
+ memcpy(daddr->r_addr.data, bufp, len);
+ /*
+ * append the port number. interpreted as two more bytes
+ * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049.
+ */
+ memcpy(daddr->r_addr.data + len, ".8.1", 4);
+ daddr->r_addr.len = len + 4;
+
+ fdev.fl_device_list[i].fl_multipath_length = 1;
+ fdev.fl_device_list[i].fl_multipath_list = daddr;
+
+ dprintk("%s: encoding DS |%s|\n", __func__, bufp);
+
+ bufp += len + 1;
+ }
+
+ /* have nfsd encode the device info */
+ err = filelayout_encode_devinfo(xdr, &fdev);
+out:
+ for (i = 0; i < fdev.fl_device_length; i++)
+ kfree(fdev.fl_device_list[i].fl_multipath_list);
+ kfree(fdev.fl_device_list);
+ kfree(fdev.fl_stripeindices_list);
+ dprintk("<-- %s returns %d\n", __func__, err);
+ return err;
+}
+
+static int get_stripe_unit(int blocksize)
+{
+ if (blocksize >= NFSSVC_MAXBLKSIZE)
+ return blocksize;
+ return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
+}
+
+/*
+ * Look up inode block device in pnfs_dlm_device list.
+ * Hash on the inode->i_ino and number of data servers.
+ */
+static int dlm_ino_hash(struct inode *ino)
+{
+ struct dlm_device_entry *de;
+ u32 hash_mask = 0;
+
+	/* If we can't find the inode block device in the pnfs_dlm_device list
+ * then don't hand out a layout
+ */
+ de = nfsd4_find_pnfs_dlm_device(ino->i_sb);
+ if (!de)
+ return -1;
+ hash_mask = de->num_ds - 1;
+ return ino->i_ino & hash_mask;
+}
+
+static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode,
+ struct exp_xdr_stream *xdr,
+ const struct nfsd4_pnfs_layoutget_arg *args,
+ struct nfsd4_pnfs_layoutget_res *res)
+{
+ struct pnfs_filelayout_layout *layout = NULL;
+ struct knfsd_fh *fhp = NULL;
+ int index;
+ enum nfsstat4 rc = NFS4_OK;
+
+ dprintk("%s: LAYOUT_GET\n", __func__);
+
+ /* DLM exported file systems only support layouts for READ */
+ if (res->lg_seg.iomode == IOMODE_RW)
+ return NFS4ERR_BADIOMODE;
+
+ index = dlm_ino_hash(inode);
+ dprintk("%s first stripe index %d i_ino %lu\n", __func__, index,
+ inode->i_ino);
+ if (index < 0)
+ return NFS4ERR_LAYOUTUNAVAILABLE;
+
+ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
+ /* Always give out whole file layouts */
+ res->lg_seg.offset = 0;
+ res->lg_seg.length = NFS4_MAX_UINT64;
+ /* Always give out READ ONLY layouts */
+ res->lg_seg.iomode = IOMODE_READ;
+
+ layout = kzalloc(sizeof(*layout), GFP_KERNEL);
+ if (layout == NULL) {
+ rc = NFS4ERR_LAYOUTTRYLATER;
+ goto error;
+ }
+
+ /* Set file layout response args */
+ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
+ layout->lg_stripe_type = STRIPE_SPARSE;
+ layout->lg_commit_through_mds = false;
+ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
+ layout->lg_fh_length = 1;
+ layout->device_id.sbid = args->lg_sbid;
+ layout->device_id.devid = 1; /*FSFTEMP*/
+ layout->lg_first_stripe_index = index; /*FSFTEMP*/
+ layout->lg_pattern_offset = 0;
+
+ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
+ if (fhp == NULL) {
+ rc = NFS4ERR_LAYOUTTRYLATER;
+ goto error;
+ }
+
+ memcpy(fhp, args->lg_fh, sizeof(*fhp));
+ pnfs_fh_mark_ds(fhp);
+ layout->lg_fh_list = fhp;
+
+ /* Call nfsd to encode layout */
+ rc = filelayout_encode_layout(xdr, layout);
+exit:
+ kfree(layout);
+ kfree(fhp);
+ return rc;
+
+error:
+ res->lg_seg.length = 0;
+ goto exit;
+}
+
+static int
+nfsd4_pnfs_dlm_layouttype(struct super_block *sb)
+{
+ return LAYOUT_NFSV4_1_FILES;
+}
+
+/* For use by DLM cluster file systems exported by pNFSD */
+const struct pnfs_export_operations pnfs_dlm_export_ops = {
+ .layout_type = nfsd4_pnfs_dlm_layouttype,
+ .get_device_info = nfsd4_pnfs_dlm_getdevinfo,
+ .get_device_iter = nfsd4_pnfs_dlm_getdeviter,
+ .layout_get = nfsd4_pnfs_dlm_layoutget,
+};
+EXPORT_SYMBOL(pnfs_dlm_export_ops);
diff -up linux-2.6.38.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4pnfsds.c
--- linux-2.6.38.noarch/fs/nfsd/nfs4pnfsds.c.orig 2011-03-26 07:57:44.288821192 -0400
+++ linux-2.6.38.noarch/fs/nfsd/nfs4pnfsds.c 2011-03-26 07:57:44.288821192 -0400
@@ -0,0 +1,620 @@
+/*
+* linux/fs/nfsd/nfs4pnfsds.c
+*
+* Copyright (c) 2005 The Regents of the University of Michigan.
+* All rights reserved.
+*
+* Andy Adamson <andros@umich.edu>
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* 1. Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+* 2. Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the distribution.
+* 3. Neither the name of the University nor the names of its
+* contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if defined(CONFIG_PNFSD)
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+#include <linux/param.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/debug.h>
+#include <linux/nfs4.h>
+#include <linux/exportfs.h>
+#include <linux/sched.h>
+
+#include "nfsd.h"
+#include "pnfsd.h"
+#include "state.h"
+
+/*
+ *******************
+ * PNFS
+ *******************
+ */
+/*
+ * Hash tables for pNFS Data Server state
+ *
+ * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using
+ * this data server (DS).
+ *
+ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained
+ * from any MDS.
+ *
+ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained
+ * from any MDS.
+ *
+ */
+/* Hash tables for clientid state */
+#define CLIENT_HASH_BITS 4
+#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
+#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
+
+#define clientid_hashval(id) \
+ ((id) & CLIENT_HASH_MASK)
+
+/* hash table for pnfs_ds_stateid */
+#define STATEID_HASH_BITS 10
+#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
+#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1)
+
+#define stateid_hashval(owner_id, file_id) \
+ (((owner_id) + (file_id)) & STATEID_HASH_MASK)
+
+static struct list_head mds_id_tbl;
+static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE];
+static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE];
+
+static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp);
+static inline void put_ds_mdsid(struct pnfs_mds_id *mdp);
+
+/* Mutex for data server state. Needs to be separate from
+ * mds state mutex since a node can be both mds and ds */
+static DEFINE_MUTEX(ds_mutex);
+static struct thread_info *ds_mutex_owner;
+
+static void
+ds_lock_state(void)
+{
+ mutex_lock(&ds_mutex);
+ ds_mutex_owner = current_thread_info();
+}
+
+static void
+ds_unlock_state(void)
+{
+ BUG_ON(ds_mutex_owner != current_thread_info());
+ ds_mutex_owner = NULL;
+ mutex_unlock(&ds_mutex);
+}
+
+static int
+cmp_clid(const clientid_t *cl1, const clientid_t *cl2)
+{
+ return (cl1->cl_boot == cl2->cl_boot) &&
+ (cl1->cl_id == cl2->cl_id);
+}
+
+void
+nfs4_pnfs_state_init(void)
+{
+ int i;
+
+ for (i = 0; i < CLIENT_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&mds_clid_hashtbl[i]);
+
+ for (i = 0; i < STATEID_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&ds_stid_hashtbl[i]);
+
+ INIT_LIST_HEAD(&mds_id_tbl);
+}
+
+static struct pnfs_mds_id *
+find_pnfs_mds_id(u32 mdsid)
+{
+ struct pnfs_mds_id *local = NULL;
+
+ dprintk("pNFSD: %s\n", __func__);
+ list_for_each_entry(local, &mds_id_tbl, di_hash) {
+ if (local->di_mdsid == mdsid)
+ return local;
+ }
+ return NULL;
+}
+
+static struct pnfs_ds_clientid *
+find_pnfs_ds_clientid(const clientid_t *clid)
+{
+ struct pnfs_ds_clientid *local = NULL;
+ unsigned int hashval;
+
+ dprintk("pNFSD: %s\n", __func__);
+
+ hashval = clientid_hashval(clid->cl_id);
+ list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) {
+ if (cmp_clid(&local->dc_mdsclid, clid))
+ return local;
+ }
+ return NULL;
+}
+
+static struct pnfs_ds_stateid *
+find_pnfs_ds_stateid(stateid_t *stid)
+{
+ struct pnfs_ds_stateid *local = NULL;
+ u32 st_id = stid->si_stateownerid;
+ u32 f_id = stid->si_fileid;
+ unsigned int hashval;
+
+ dprintk("pNFSD: %s\n", __func__);
+
+ hashval = stateid_hashval(st_id, f_id);
+ list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash)
+ if ((local->ds_stid.si_stateownerid == st_id) &&
+ (local->ds_stid.si_fileid == f_id) &&
+ (local->ds_stid.si_boot == stid->si_boot)) {
+ stateid_t *sid = &local->ds_stid;
+ dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n",
+ __func__, local, local->ds_flags,
+ STATEID_VAL(sid));
+ return local;
+ }
+ return NULL;
+}
+
+static void
+release_ds_mdsid(struct kref *kref)
+{
+ struct pnfs_mds_id *mdp =
+ container_of(kref, struct pnfs_mds_id, di_ref);
+ dprintk("pNFSD: %s\n", __func__);
+
+ list_del(&mdp->di_hash);
+ list_del(&mdp->di_mdsclid);
+ kfree(mdp);
+}
+
+static void
+release_ds_clientid(struct kref *kref)
+{
+ struct pnfs_ds_clientid *dcp =
+ container_of(kref, struct pnfs_ds_clientid, dc_ref);
+ struct pnfs_mds_id *mdp;
+ dprintk("pNFSD: %s\n", __func__);
+
+ mdp = find_pnfs_mds_id(dcp->dc_mdsid);
+ if (mdp)
+ put_ds_mdsid(mdp);
+
+ list_del(&dcp->dc_hash);
+ list_del(&dcp->dc_stateid);
+ list_del(&dcp->dc_permdsid);
+ kfree(dcp);
+}
+
+static void
+release_ds_stateid(struct kref *kref)
+{
+ struct pnfs_ds_stateid *dsp =
+ container_of(kref, struct pnfs_ds_stateid, ds_ref);
+ struct pnfs_ds_clientid *dcp;
+ dprintk("pNFS %s: dsp %p\n", __func__, dsp);
+
+ dcp = find_pnfs_ds_clientid(&dsp->ds_mdsclid);
+ if (dcp)
+ put_ds_clientid(dcp);
+
+ list_del(&dsp->ds_hash);
+ list_del(&dsp->ds_perclid);
+ kfree(dsp);
+}
+
+static inline void
+put_ds_clientid(struct pnfs_ds_clientid *dcp)
+{
+ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp,
+ atomic_read(&dcp->dc_ref.refcount));
+ kref_put(&dcp->dc_ref, release_ds_clientid);
+}
+
+static inline void
+get_ds_clientid(struct pnfs_ds_clientid *dcp)
+{
+ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp,
+ atomic_read(&dcp->dc_ref.refcount));
+ kref_get(&dcp->dc_ref);
+}
+
+static inline void
+put_ds_mdsid(struct pnfs_mds_id *mdp)
+{
+ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp,
+ atomic_read(&mdp->di_ref.refcount));
+ kref_put(&mdp->di_ref, release_ds_mdsid);
+}
+
+static inline void
+get_ds_mdsid(struct pnfs_mds_id *mdp)
+{
+ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp,
+ atomic_read(&mdp->di_ref.refcount));
+ kref_get(&mdp->di_ref);
+}
+
+static inline void
+put_ds_stateid(struct pnfs_ds_stateid *dsp)
+{
+ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp,
+ atomic_read(&dsp->ds_ref.refcount));
+ kref_put(&dsp->ds_ref, release_ds_stateid);
+}
+
+static inline void
+get_ds_stateid(struct pnfs_ds_stateid *dsp)
+{
+ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp,
+ atomic_read(&dsp->ds_ref.refcount));
+ kref_get(&dsp->ds_ref);
+}
+
+void
+nfs4_pnfs_state_shutdown(void)
+{
+ struct pnfs_ds_stateid *dsp;
+ int i;
+
+ dprintk("pNFSD %s: -->\n", __func__);
+
+ ds_lock_state();
+ for (i = 0; i < STATEID_HASH_SIZE; i++) {
+ while (!list_empty(&ds_stid_hashtbl[i])) {
+ dsp = list_entry(ds_stid_hashtbl[i].next,
+ struct pnfs_ds_stateid, ds_hash);
+ put_ds_stateid(dsp);
+ }
+ }
+ ds_unlock_state();
+}
+
+static struct pnfs_mds_id *
+alloc_init_mds_id(struct pnfs_get_state *gsp)
+{
+ struct pnfs_mds_id *mdp;
+
+ dprintk("pNFSD: %s\n", __func__);
+
+ mdp = kmalloc(sizeof(*mdp), GFP_KERNEL);
+ if (!mdp)
+ return NULL;
+ INIT_LIST_HEAD(&mdp->di_hash);
+ INIT_LIST_HEAD(&mdp->di_mdsclid);
+ list_add(&mdp->di_hash, &mds_id_tbl);
+ mdp->di_mdsid = gsp->dsid;
+ mdp->di_mdsboot = 0;
+ kref_init(&mdp->di_ref);
+ return mdp;
+}
+
+static struct pnfs_ds_clientid *
+alloc_init_ds_clientid(struct pnfs_get_state *gsp)
+{
+ struct pnfs_mds_id *mdp;
+ struct pnfs_ds_clientid *dcp;
+ clientid_t *clid = (clientid_t *)&gsp->clid;
+ unsigned int hashval = clientid_hashval(clid->cl_id);
+
+ dprintk("pNFSD: %s\n", __func__);
+
+ mdp = find_pnfs_mds_id(gsp->dsid);
+ if (!mdp) {
+ mdp = alloc_init_mds_id(gsp);
+ if (!mdp)
+ return NULL;
+ } else {
+ get_ds_mdsid(mdp);
+ }
+
+ dcp = kmalloc(sizeof(*dcp), GFP_KERNEL);
+ if (!dcp)
+ return NULL;
+
+ INIT_LIST_HEAD(&dcp->dc_hash);
+ INIT_LIST_HEAD(&dcp->dc_stateid);
+ INIT_LIST_HEAD(&dcp->dc_permdsid);
+ list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]);
+ list_add(&dcp->dc_permdsid, &mdp->di_mdsclid);
+ dcp->dc_mdsclid = *clid;
+ kref_init(&dcp->dc_ref);
+ dcp->dc_mdsid = gsp->dsid;
+ return dcp;
+}
+
+static struct pnfs_ds_stateid *
+alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp)
+{
+ struct pnfs_ds_stateid *dsp;
+ u32 st_id = stidp->si_stateownerid;
+ u32 f_id = stidp->si_fileid;
+ unsigned int hashval;
+
+ dprintk("pNFSD: %s\n", __func__);
+
+ dsp = kmalloc(sizeof(*dsp), GFP_KERNEL);
+ if (!dsp)
+ return dsp;
+
+ INIT_LIST_HEAD(&dsp->ds_hash);
+ INIT_LIST_HEAD(&dsp->ds_perclid);
+ memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t));
+ fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle);
+ dsp->ds_access = 0;
+ dsp->ds_status = 0;
+ dsp->ds_flags = 0L;
+ kref_init(&dsp->ds_ref);
+ set_bit(DS_STATEID_NEW, &dsp->ds_flags);
+ clear_bit(DS_STATEID_VALID, &dsp->ds_flags);
+ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
+ init_waitqueue_head(&dsp->ds_waitq);
+
+ hashval = stateid_hashval(st_id, f_id);
+ list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]);
+ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
+ return dsp;
+}
+
+static int
+update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh,
+ struct pnfs_get_state *gsp)
+{
+ struct pnfs_ds_clientid *dcp;
+ int new = 0;
+
+ dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
+
+ dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid);
+ if (!dcp) {
+ dcp = alloc_init_ds_clientid(gsp);
+ if (!dcp)
+ return 1;
+ new = 1;
+ }
+ if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) {
+ list_add(&dsp->ds_perclid, &dcp->dc_stateid);
+ if (!new)
+ get_ds_clientid(dcp);
+ }
+
+ memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t));
+ dsp->ds_access = gsp->access;
+ dsp->ds_status = 0;
+ dsp->ds_verifier[0] = gsp->verifier[0];
+ dsp->ds_verifier[1] = gsp->verifier[1];
+ memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t));
+ set_bit(DS_STATEID_VALID, &dsp->ds_flags);
+ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
+ clear_bit(DS_STATEID_NEW, &dsp->ds_flags);
+ return 0;
+}
+
+int
+nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs)
+{
+ stateid_t *stid = (stateid_t *)&gs->stid;
+ struct pnfs_ds_stateid *dsp;
+
+ dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__,
+ STATEID_VAL(stid));
+
+ ds_lock_state();
+ dsp = find_pnfs_ds_stateid(stid);
+ if (dsp)
+ put_ds_stateid(dsp);
+ ds_unlock_state();
+
+ dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
+
+ if (dsp)
+ return 0;
+ return -ENOENT;
+}
+
+/* Retrieves and validates stateid.
+ * If stateid exists and its fields match, return it.
+ * If stateid exists but either the generation or
+ * ownerids don't match, check with mds to see if it is valid.
+ * If the stateid doesn't exist, the first thread creates an
+ * invalid *marker* stateid, then checks to see if the
+ * stateid exists on the mds. If so, it validates the *marker*
+ * stateid and updates its fields. Subsequent threads that
+ * find the *marker* stateid wait until it is valid or an error
+ * occurs.
+ * Called with ds_state_lock.
+ */
+static struct pnfs_ds_stateid *
+nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp)
+{
+ struct inode *ino = cfh->fh_dentry->d_inode;
+ struct super_block *sb;
+ struct pnfs_ds_stateid *dsp = NULL;
+ struct pnfs_get_state gs = {
+ .access = 0,
+ };
+ int status = 0, waiter = 0;
+
+ dprintk("pNFSD: %s -->\n", __func__);
+
+ dsp = find_pnfs_ds_stateid(stidp);
+ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) &&
+ (stidp->si_generation == dsp->ds_stid.si_generation))
+ goto out_noput;
+
+ sb = ino->i_sb;
+ if (!sb || !sb->s_pnfs_op->get_state)
+ goto out_noput;
+
+	/* Invalidate the current state if it exists but doesn't match.
+	 * If it is already invalid, another thread is checking state */
+ if (dsp) {
+ if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags))
+ waiter = 1;
+ } else {
+ dsp = alloc_init_ds_stateid(cfh, stidp);
+ if (!dsp)
+ goto out_noput;
+ }
+
+ dprintk("pNFSD: %s Starting loop\n", __func__);
+ get_ds_stateid(dsp);
+ while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
+ ds_unlock_state();
+
+ /* Another thread is checking the state */
+ if (waiter) {
+ dprintk("pNFSD: %s waiting\n", __func__);
+ wait_event_interruptible_timeout(dsp->ds_waitq,
+ (test_bit(DS_STATEID_VALID, &dsp->ds_flags) ||
+ test_bit(DS_STATEID_ERROR, &dsp->ds_flags)),
+ msecs_to_jiffies(1024));
+ dprintk("pNFSD: %s awake\n", __func__);
+ ds_lock_state();
+ if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
+ goto out;
+
+ continue;
+ }
+
+ /* Validate stateid on mds */
+ dprintk("pNFSD: %s Checking state on MDS\n", __func__);
+ memcpy(&gs.stid, stidp, sizeof(stateid_t));
+ status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs);
+ dprintk("pNFSD: %s from MDS status %d\n", __func__, status);
+ ds_lock_state();
+ /* if !status and stateid is valid, update id and mark valid */
+ if (status || update_ds_stateid(dsp, cfh, &gs)) {
+ set_bit(DS_STATEID_ERROR, &dsp->ds_flags);
+ /* remove invalid stateid from list */
+ put_ds_stateid(dsp);
+ wake_up(&dsp->ds_waitq);
+ goto out;
+ }
+
+ wake_up(&dsp->ds_waitq);
+ }
+out:
+ if (dsp)
+ put_ds_stateid(dsp);
+out_noput:
+ if (dsp)
+ dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n",
+ __func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid));
+ /* If error, return null */
+ if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
+ dsp = NULL;
+ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
+ return dsp;
+}
+
+int
+nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid)
+{
+ struct pnfs_ds_stateid *dsp;
+ int status = 0;
+
+ dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__,
+ STATEID_VAL(stateid));
+
+ /* Must release state lock while verifying stateid on mds */
+ nfs4_unlock_state();
+ ds_lock_state();
+ dsp = nfsv4_ds_get_state(cfh, stateid);
+ if (dsp) {
+ get_ds_stateid(dsp);
+ dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__,
+ STATEID_VAL(&dsp->ds_stid));
+
+ dprintk("NFSD: %s: dsp %p fh_size %u:%u "
+ "fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] "
+ "gen %x:%x\n",
+ __func__, dsp,
+ cfh->fh_handle.fh_size, dsp->ds_fh.fh_size,
+ ((unsigned *)&cfh->fh_handle.fh_base)[0],
+ ((unsigned *)&cfh->fh_handle.fh_base)[1],
+ ((unsigned *)&cfh->fh_handle.fh_base)[2],
+ ((unsigned *)&cfh->fh_handle.fh_base)[3],
+ ((unsigned *)&dsp->ds_fh.fh_base)[0],
+ ((unsigned *)&dsp->ds_fh.fh_base)[1],
+ ((unsigned *)&dsp->ds_fh.fh_base)[2],
+ ((unsigned *)&dsp->ds_fh.fh_base)[3],
+ stateid->si_generation, dsp->ds_stid.si_generation);
+ }
+
+ if (!dsp ||
+ (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) ||
+ (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base,
+ dsp->ds_fh.fh_size) != 0) ||
+ (stateid->si_generation > dsp->ds_stid.si_generation))
+ status = nfserr_bad_stateid;
+ else if (stateid->si_generation < dsp->ds_stid.si_generation)
+ status = nfserr_old_stateid;
+
+ if (dsp)
+ put_ds_stateid(dsp);
+ ds_unlock_state();
+ nfs4_lock_state();
+ dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status));
+ return status;
+}
+
+void
+nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p)
+{
+ struct pnfs_ds_stateid *dsp = NULL;
+
+ dprintk("pNFSD: %s --> stid %p\n", __func__, stateid);
+
+ ds_lock_state();
+ if (stateid != NULL) {
+ dsp = find_pnfs_ds_stateid(stateid);
+ if (dsp)
+ get_ds_stateid(dsp);
+ }
+
+ /* XXX: Should we fetch the stateid or wait if some other
+ * thread is currently retrieving the stateid ? */
+ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
+ *p++ = dsp->ds_verifier[0];
+ *p++ = dsp->ds_verifier[1];
+ put_ds_stateid(dsp);
+ } else {
+ /* must be on MDS */
+ ds_unlock_state();
+ sb->s_pnfs_op->get_verifier(sb, p);
+ ds_lock_state();
+ p += 2;
+ }
+ ds_unlock_state();
+ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
+ return;
+}
+
+#endif /* CONFIG_PNFSD */
diff -up linux-2.6.38.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4proc.c
--- linux-2.6.38.noarch/fs/nfsd/nfs4proc.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/nfs4proc.c 2011-03-26 07:57:44.289821184 -0400
@@ -34,10 +34,14 @@
*/
#include <linux/file.h>
#include <linux/slab.h>
+#include <linux/nfsd/nfs4layoutxdr.h>
+#include <linux/nfsd4_spnfs.h>
+#include <linux/nfsd4_block.h>
#include "cache.h"
#include "xdr4.h"
#include "vfs.h"
+#include "pnfsd.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc
* set, (2) sets open->op_stateid, (3) sets open->op_delegation.
*/
status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
+#if defined(CONFIG_SPNFS)
+ if (!status && spnfs_enabled()) {
+ struct inode *inode = cstate->current_fh.fh_dentry->d_inode;
+
+ status = spnfs_open(inode, open);
+ if (status) {
+ dprintk(
+ "nfsd: pNFS could not be enabled for inode: %lu\n",
+ inode->i_ino);
+ /*
+ * XXX When there's a failure then need to indicate to
+ * future ops that no pNFS is available. Should I save
+ * the status in the inode? It's kind of a big hammer.
+ * But there may be no stripes available?
+ */
+ }
+ }
+#endif /* CONFIG_SPNFS */
out:
if (open->op_stateowner) {
nfs4_get_stateowner(open->op_stateowner);
@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str
&access->ac_supported);
}
+static void
+nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf)
+{
+ u32 *p = (u32 *)verf->data;
+
+#if defined(CONFIG_PNFSD)
+ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) {
+ nfs4_ds_get_verifier(NULL, sb, p);
+ return;
+ }
+#endif /* CONFIG_PNFSD */
+
+ *p++ = nfssvc_boot.tv_sec;
+ *p++ = nfssvc_boot.tv_usec;
+}
+
static __be32
nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_commit *commit)
{
__be32 status;
- u32 *p = (u32 *)commit->co_verf.data;
- *p++ = nfssvc_boot.tv_sec;
- *p++ = nfssvc_boot.tv_usec;
-
+ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb,
+ &commit->co_verf);
status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
commit->co_count);
if (status == nfserr_symlink)
@@ -846,7 +882,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru
{
stateid_t *stateid = &write->wr_stateid;
struct file *filp = NULL;
- u32 *p;
__be32 status = nfs_ok;
unsigned long cnt;
@@ -868,13 +903,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru
cnt = write->wr_buflen;
write->wr_how_written = write->wr_stable_how;
- p = (u32 *)write->wr_verifier.data;
- *p++ = nfssvc_boot.tv_sec;
- *p++ = nfssvc_boot.tv_usec;
+ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb,
+ &write->wr_verifier);
+#if defined(CONFIG_SPNFS)
+#if defined(CONFIG_SPNFS_BLOCK)
+ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) {
+ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode,
+ RETURN_FILE, write->wr_offset, write->wr_buflen);
+ if (!status) {
+ status = nfsd_write(rqstp, &cstate->current_fh, filp,
+ write->wr_offset, rqstp->rq_vec, write->wr_vlen,
+ &cnt, &write->wr_how_written);
+ }
+ } else
+#endif
+
+ if (spnfs_enabled()) {
+ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode,
+ write->wr_offset, write->wr_buflen, write->wr_vlen,
+ rqstp);
+ if (status == nfs_ok) {
+ /* DMXXX: HACK to get filesize set */
+ /* write one byte at offset+length-1 */
+ struct kvec k[1];
+ char zero = 0;
+ unsigned long cnt = 1;
+
+ k[0].iov_base = (void *)&zero;
+ k[0].iov_len = 1;
+ nfsd_write(rqstp, &cstate->current_fh, filp,
+ write->wr_offset+write->wr_buflen-1, k, 1,
+ &cnt, &write->wr_how_written);
+ }
+ } else /* we're not an MDS */
+ status = nfsd_write(rqstp, &cstate->current_fh, filp,
+ write->wr_offset, rqstp->rq_vec, write->wr_vlen,
+ &cnt, &write->wr_how_written);
+#else
status = nfsd_write(rqstp, &cstate->current_fh, filp,
write->wr_offset, rqstp->rq_vec, write->wr_vlen,
&cnt, &write->wr_how_written);
+#endif /* CONFIG_SPNFS */
+
if (filp)
fput(filp);
@@ -965,6 +1036,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str
return status == nfserr_same ? nfs_ok : status;
}
+#if defined(CONFIG_PNFSD)
+
+static __be32
+nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp,
+ unsigned int layout_type)
+{
+ int status, type;
+
+ /* check to see if pNFS is supported. */
+ status = nfserr_layoutunavailable;
+ if (exp && exp->ex_pnfs == 0) {
+ dprintk("%s: Underlying file system "
+ "is not exported over pNFS\n", __func__);
+ goto out;
+ }
+ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) {
+ dprintk("%s: Underlying file system "
+ "does not support pNFS\n", __func__);
+ goto out;
+ }
+
+ type = sb->s_pnfs_op->layout_type(sb);
+
+ /* check to see if requested layout type is supported. */
+ status = nfserr_unknown_layouttype;
+ if (!type)
+ dprintk("BUG: %s: layout_type 0 is reserved and must not be "
+ "used by filesystem\n", __func__);
+ else if (type != layout_type)
+ dprintk("%s: requested layout type %d "
+ "does not match supported type %d\n",
+ __func__, layout_type, type);
+ else
+ status = nfs_ok;
+out:
+ return status;
+}
+
+static __be32
+nfsd4_getdevlist(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_pnfs_getdevlist *gdlp)
+{
+ struct super_block *sb;
+ struct svc_fh *current_fh = &cstate->current_fh;
+ int status;
+
+ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n",
+ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices,
+ gdlp->gd_cookie, gdlp->gd_verf);
+
+
+ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
+ if (status)
+ goto out;
+
+ status = nfserr_inval;
+ sb = current_fh->fh_dentry->d_inode->i_sb;
+ if (!sb)
+ goto out;
+
+	/* We must be able to encode at least one device */
+	if (!gdlp->gd_maxdevices)
+ goto out;
+
+ /* Ensure underlying file system supports pNFS and,
+ * if so, the requested layout type
+ */
+ status = nfsd4_layout_verify(sb, current_fh->fh_export,
+ gdlp->gd_layout_type);
+ if (status)
+ goto out;
+
+	/* Fail with nfserr_notsupp if the underlying file system
+	 * does not support getdevicelist */
+ if (!sb->s_pnfs_op->get_device_iter) {
+ status = nfserr_notsupp;
+ goto out;
+ }
+
+ /* Set up arguments so device can be retrieved at encode time */
+ gdlp->gd_fhp = &cstate->current_fh;
+out:
+ return status;
+}
+
+static __be32
+nfsd4_getdevinfo(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_pnfs_getdevinfo *gdp)
+{
+ struct super_block *sb;
+ int status;
+ clientid_t clid;
+
+ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n",
+ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid,
+ gdp->gd_devid.devid, gdp->gd_maxcount);
+
+ status = nfserr_inval;
+ sb = find_sbid_id(gdp->gd_devid.sbid);
+ dprintk("%s: sb %p\n", __func__, sb);
+ if (!sb) {
+ status = nfserr_noent;
+ goto out;
+ }
+
+ /* Ensure underlying file system supports pNFS and,
+ * if so, the requested layout type
+ */
+ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type);
+ if (status)
+ goto out;
+
+ /* Set up arguments so device can be retrieved at encode time */
+ gdp->gd_sb = sb;
+
+ /* Update notifications */
+ copy_clientid(&clid, cstate->session);
+ pnfs_set_device_notify(&clid, gdp->gd_notify_types);
+out:
+ return status;
+}
+
+static __be32
+nfsd4_layoutget(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_pnfs_layoutget *lgp)
+{
+ int status;
+ struct super_block *sb;
+ struct svc_fh *current_fh = &cstate->current_fh;
+
+ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
+ if (status)
+ goto out;
+
+ status = nfserr_inval;
+ sb = current_fh->fh_dentry->d_inode->i_sb;
+ if (!sb)
+ goto out;
+
+ /* Ensure underlying file system supports pNFS and,
+ * if so, the requested layout type
+ */
+ status = nfsd4_layout_verify(sb, current_fh->fh_export,
+ lgp->lg_seg.layout_type);
+ if (status)
+ goto out;
+
+ status = nfserr_badiomode;
+ if (lgp->lg_seg.iomode != IOMODE_READ &&
+ lgp->lg_seg.iomode != IOMODE_RW) {
+ dprintk("pNFS %s: invalid iomode %d\n", __func__,
+ lgp->lg_seg.iomode);
+ goto out;
+ }
+
+ /* Set up arguments so layout can be retrieved at encode time */
+ lgp->lg_fhp = current_fh;
+ copy_clientid((clientid_t *)&lgp->lg_seg.clientid, cstate->session);
+ status = nfs_ok;
+out:
+ return status;
+}
+
+static __be32
+nfsd4_layoutcommit(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_pnfs_layoutcommit *lcp)
+{
+ int status;
+ struct inode *ino = NULL;
+ struct iattr ia;
+ struct super_block *sb;
+ struct svc_fh *current_fh = &cstate->current_fh;
+
+ dprintk("NFSD: nfsd4_layoutcommit \n");
+ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
+ if (status)
+ goto out;
+
+ status = nfserr_inval;
+ ino = current_fh->fh_dentry->d_inode;
+ if (!ino)
+ goto out;
+
+ status = nfserr_inval;
+ sb = ino->i_sb;
+ if (!sb)
+ goto out;
+
+ /* Ensure underlying file system supports pNFS and,
+ * if so, the requested layout type
+ */
+ status = nfsd4_layout_verify(sb, current_fh->fh_export,
+ lcp->args.lc_seg.layout_type);
+ if (status)
+ goto out;
+
+ /* This will only extend the file length. Do a quick
+ * check to see if there is any point in waiting for the update
+ * locks.
+ * TODO: Is this correct for all back ends?
+ */
+ dprintk("%s:new offset: %d new size: %llu old size: %lld\n",
+ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1,
+ ino->i_size);
+
+ /* Set clientid from sessionid */
+ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session);
+ lcp->res.lc_size_chg = 0;
+ if (sb->s_pnfs_op->layout_commit) {
+ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res);
+ dprintk("%s:layout_commit result %d\n", __func__, status);
+ } else {
+ fh_lock(current_fh);
+ if ((lcp->args.lc_newoffset == 0) ||
+ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) {
+ status = 0;
+ lcp->res.lc_size_chg = 0;
+ fh_unlock(current_fh);
+ goto out;
+ }
+
+ /* Try our best to update the file size */
+ dprintk("%s: Modifying file size\n", __func__);
+ ia.ia_valid = ATTR_SIZE;
+ ia.ia_size = lcp->args.lc_last_wr + 1;
+ status = notify_change(current_fh->fh_dentry, &ia);
+ fh_unlock(current_fh);
+ dprintk("%s:notify_change result %d\n", __func__, status);
+ }
+
+ if (!status && lcp->res.lc_size_chg &&
+ EX_ISSYNC(current_fh->fh_export)) {
+ dprintk("%s: Synchronously writing inode size %llu\n",
+ __func__, ino->i_size);
+ write_inode_now(ino, 1);
+ lcp->res.lc_newsize = i_size_read(ino);
+ }
+out:
+ return status;
+}
+
+static __be32
+nfsd4_layoutreturn(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_pnfs_layoutreturn *lrp)
+{
+ int status;
+ struct super_block *sb;
+ struct svc_fh *current_fh = &cstate->current_fh;
+
+ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
+ if (status)
+ goto out;
+
+ status = nfserr_inval;
+ sb = current_fh->fh_dentry->d_inode->i_sb;
+ if (!sb)
+ goto out;
+
+ /* Ensure underlying file system supports pNFS and,
+ * if so, the requested layout type
+ */
+ status = nfsd4_layout_verify(sb, current_fh->fh_export,
+ lrp->args.lr_seg.layout_type);
+ if (status)
+ goto out;
+
+ status = nfserr_inval;
+ if (lrp->args.lr_return_type != RETURN_FILE &&
+ lrp->args.lr_return_type != RETURN_FSID &&
+ lrp->args.lr_return_type != RETURN_ALL) {
+ dprintk("pNFS %s: invalid return_type %d\n", __func__,
+ lrp->args.lr_return_type);
+ goto out;
+ }
+
+ status = nfserr_inval;
+ if (lrp->args.lr_seg.iomode != IOMODE_READ &&
+ lrp->args.lr_seg.iomode != IOMODE_RW &&
+ lrp->args.lr_seg.iomode != IOMODE_ANY) {
+ dprintk("pNFS %s: invalid iomode %d\n", __func__,
+ lrp->args.lr_seg.iomode);
+ goto out;
+ }
+
+ /* Set clientid from sessionid */
+ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session);
+ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE);
+ status = nfs4_pnfs_return_layout(sb, current_fh, lrp);
+out:
+ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n",
+ __func__, status, lrp->args.lr_return_type, lrp->lrs_present);
+ return status;
+}
+#endif /* CONFIG_PNFSD */
+
/*
* NULL call.
*/
@@ -1355,6 +1726,29 @@ static struct nfsd4_operation nfsd4_ops[
.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
.op_name = "OP_SECINFO_NO_NAME",
},
+#if defined(CONFIG_PNFSD)
+ [OP_GETDEVICELIST] = {
+ .op_func = (nfsd4op_func)nfsd4_getdevlist,
+ .op_name = "OP_GETDEVICELIST",
+ },
+ [OP_GETDEVICEINFO] = {
+ .op_func = (nfsd4op_func)nfsd4_getdevinfo,
+ .op_flags = ALLOWED_WITHOUT_FH,
+ .op_name = "OP_GETDEVICEINFO",
+ },
+ [OP_LAYOUTGET] = {
+ .op_func = (nfsd4op_func)nfsd4_layoutget,
+ .op_name = "OP_LAYOUTGET",
+ },
+ [OP_LAYOUTCOMMIT] = {
+ .op_func = (nfsd4op_func)nfsd4_layoutcommit,
+ .op_name = "OP_LAYOUTCOMMIT",
+ },
+ [OP_LAYOUTRETURN] = {
+ .op_func = (nfsd4op_func)nfsd4_layoutreturn,
+ .op_name = "OP_LAYOUTRETURN",
+ },
+#endif /* CONFIG_PNFSD */
};
static const char *nfsd4_op_name(unsigned opnum)
diff -up linux-2.6.38.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4state.c
--- linux-2.6.38.noarch/fs/nfsd/nfs4state.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/nfs4state.c 2011-03-26 07:57:44.291821168 -0400
@@ -42,6 +42,8 @@
#include "xdr4.h"
#include "vfs.h"
+#include "pnfsd.h"
+
#define NFSDDBG_FACILITY NFSDDBG_PROC
/* Globals */
@@ -59,8 +61,6 @@ static u64 current_sessionid = 1;
#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
/* forward declarations */
-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
static void nfs4_set_recdir(char *recdir);
@@ -68,6 +68,7 @@ static void nfs4_set_recdir(char *recdir
/* Currently used for almost all code touching nfsv4 state: */
static DEFINE_MUTEX(client_mutex);
+struct task_struct *client_mutex_owner;
/*
* Currently used for the del_recall_lru and file hash table. In an
@@ -85,11 +86,21 @@ void
nfs4_lock_state(void)
{
mutex_lock(&client_mutex);
+ client_mutex_owner = current;
+}
+
+#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current)
+
+void
+nfs4_bug_on_unlocked_state(void)
+{
+ BUG_ON(client_mutex_owner != current);
}
void
nfs4_unlock_state(void)
{
+ client_mutex_owner = NULL;
mutex_unlock(&client_mutex);
}
@@ -108,7 +119,7 @@ opaque_hashval(const void *ptr, int nbyt
static struct list_head del_recall_lru;
-static inline void
+inline void
put_nfs4_file(struct nfs4_file *fi)
{
if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
@@ -119,7 +130,7 @@ put_nfs4_file(struct nfs4_file *fi)
}
}
-static inline void
+inline void
get_nfs4_file(struct nfs4_file *fi)
{
atomic_inc(&fi->fi_ref);
@@ -179,10 +190,16 @@ static void nfs4_file_get_access(struct
static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
{
- if (fp->fi_fds[oflag]) {
- fput(fp->fi_fds[oflag]);
- fp->fi_fds[oflag] = NULL;
- }
+ struct file *fd = fp->fi_fds[oflag];
+
+ if (!fd)
+ return;
+
+ fp->fi_fds[oflag] = NULL;
+ BUG_ON_UNLOCKED_STATE();
+ nfs4_unlock_state(); /* allow nested layout recall/return */
+ fput(fd);
+ nfs4_lock_state();
}
static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
@@ -295,8 +312,8 @@ static DEFINE_SPINLOCK(client_lock);
* reclaim_str_hashtbl[] holds known client info from previous reset/reboot
* used in reboot/reset lease grace period processing
*
- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
- * setclientid_confirmed info.
+ * conf_id_hashtbl[], and conf_str_hashtbl[] hold
+ * confirmed setclientid_confirmed info.
*
* unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed
* setclientid info.
@@ -321,6 +338,7 @@ static void unhash_generic_stateid(struc
list_del(&stp->st_hash);
list_del(&stp->st_perfile);
list_del(&stp->st_perstateowner);
+ release_pnfs_ds_dev_list(stp);
}
static void free_generic_stateid(struct nfs4_stateid *stp)
@@ -943,6 +961,8 @@ expire_client(struct nfs4_client *clp)
struct nfs4_delegation *dp;
struct list_head reaplist;
+ BUG_ON_UNLOCKED_STATE();
+
INIT_LIST_HEAD(&reaplist);
spin_lock(&recall_lock);
while (!list_empty(&clp->cl_delegations)) {
@@ -960,6 +980,7 @@ expire_client(struct nfs4_client *clp)
sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
release_openowner(sop);
}
+ pnfs_expire_client(clp);
nfsd4_shutdown_callback(clp);
if (clp->cl_cb_conn.cb_xprt)
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -972,6 +993,13 @@ expire_client(struct nfs4_client *clp)
spin_unlock(&client_lock);
}
+void expire_client_lock(struct nfs4_client *clp)
+{
+ nfs4_lock_state();
+ expire_client(clp);
+ nfs4_unlock_state();
+}
+
static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
{
memcpy(target->cl_verifier.data, source->data,
@@ -1063,6 +1091,11 @@ static struct nfs4_client *create_client
INIT_LIST_HEAD(&clp->cl_strhash);
INIT_LIST_HEAD(&clp->cl_openowners);
INIT_LIST_HEAD(&clp->cl_delegations);
+#if defined(CONFIG_PNFSD)
+ INIT_LIST_HEAD(&clp->cl_layouts);
+ INIT_LIST_HEAD(&clp->cl_layoutrecalls);
+ atomic_set(&clp->cl_deviceref, 0);
+#endif /* CONFIG_PNFSD */
INIT_LIST_HEAD(&clp->cl_lru);
INIT_LIST_HEAD(&clp->cl_callbacks);
spin_lock_init(&clp->cl_lock);
@@ -1114,7 +1147,7 @@ move_to_confirmed(struct nfs4_client *cl
renew_client(clp);
}
-static struct nfs4_client *
+struct nfs4_client *
find_confirmed_client(clientid_t *clid)
{
struct nfs4_client *clp;
@@ -1169,6 +1202,24 @@ find_unconfirmed_client_by_str(const cha
return NULL;
}
+int
+filter_confirmed_clients(int (* func)(struct nfs4_client *, void *),
+ void *arg)
+{
+ struct nfs4_client *clp, *next;
+ int i, status = 0;
+
+ for (i = 0; i < CLIENT_HASH_SIZE; i++)
+ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i],
+ cl_strhash) {
+ status = func(clp, arg);
+ if (status)
+ break;
+ }
+
+ return status;
+}
+
static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
{
switch (family) {
@@ -1317,8 +1368,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co
static void
nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
{
- /* pNFS is not supported */
+#if defined(CONFIG_PNFSD)
+ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS |
+ EXCHGID4_FLAG_USE_PNFS_DS;
+#else /* CONFIG_PNFSD */
new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+#endif /* CONFIG_PNFSD */
/* Referrals are supported, Migration is not. */
new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
@@ -1514,6 +1569,13 @@ nfsd4_create_session(struct svc_rqst *rq
bool confirm_me = false;
int status = 0;
+#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
+ /* XXX hack to get local ip address */
+ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local,
+ sizeof(pnfsd_lexp_addr));
+ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen;
+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
+
nfs4_lock_state();
unconf = find_unconfirmed_client(&cr_ses->clientid);
conf = find_confirmed_client(&cr_ses->clientid);
@@ -1549,6 +1611,9 @@ nfsd4_create_session(struct svc_rqst *rq
goto out;
}
+ if (is_ds_only_session(unconf->cl_exchange_flags))
+ cr_ses->flags &= ~SESSION4_BACK_CHAN;
+
confirm_me = true;
conf = unconf;
} else {
@@ -1791,8 +1856,14 @@ out:
nfsd4_get_session(cstate->session);
atomic_inc(&clp->cl_refcount);
- if (clp->cl_cb_state == NFSD4_CB_DOWN)
+ switch (clp->cl_cb_state) {
+ case NFSD4_CB_DOWN:
seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
+ break;
+ case NFSD4_CB_FAULT:
+ seq->status_flags |= SEQ4_STATUS_BACKCHANNEL_FAULT;
+ break;
+ }
}
kfree(conn);
spin_unlock(&client_lock);
@@ -2051,7 +2122,7 @@ out:
/* OPEN Share state helper functions */
static inline struct nfs4_file *
-alloc_init_file(struct inode *ino)
+alloc_init_file(struct inode *ino, struct svc_fh *current_fh)
{
struct nfs4_file *fp;
unsigned int hashval = file_hashval(ino);
@@ -2068,6 +2139,16 @@ alloc_init_file(struct inode *ino)
fp->fi_lease = NULL;
memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
memset(fp->fi_access, 0, sizeof(fp->fi_access));
+#if defined(CONFIG_PNFSD)
+ INIT_LIST_HEAD(&fp->fi_layouts);
+ INIT_LIST_HEAD(&fp->fi_layout_states);
+ fp->fi_fsid.major = current_fh->fh_export->ex_fsid;
+ fp->fi_fsid.minor = 0;
+ fp->fi_fhlen = current_fh->fh_handle.fh_size;
+ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval));
+ memcpy(fp->fi_fhval, &current_fh->fh_handle.fh_base,
+ fp->fi_fhlen);
+#endif /* CONFIG_PNFSD */
spin_lock(&recall_lock);
list_add(&fp->fi_hash, &file_hashtbl[hashval]);
spin_unlock(&recall_lock);
@@ -2076,7 +2157,7 @@ alloc_init_file(struct inode *ino)
return NULL;
}
-static void
+void
nfsd4_free_slab(struct kmem_cache **slab)
{
if (*slab == NULL)
@@ -2092,6 +2173,7 @@ nfsd4_free_slabs(void)
nfsd4_free_slab(&file_slab);
nfsd4_free_slab(&stateid_slab);
nfsd4_free_slab(&deleg_slab);
+ nfsd4_free_pnfs_slabs();
}
static int
@@ -2113,6 +2195,8 @@ nfsd4_init_slabs(void)
sizeof(struct nfs4_delegation), 0, 0, NULL);
if (deleg_slab == NULL)
goto out_nomem;
+ if (nfsd4_init_pnfs_slabs())
+ goto out_nomem;
return 0;
out_nomem:
nfsd4_free_slabs();
@@ -2186,6 +2270,9 @@ init_stateid(struct nfs4_stateid *stp, s
INIT_LIST_HEAD(&stp->st_perstateowner);
INIT_LIST_HEAD(&stp->st_lockowners);
INIT_LIST_HEAD(&stp->st_perfile);
+#if defined(CONFIG_PNFSD)
+ INIT_LIST_HEAD(&stp->st_pnfs_ds_id);
+#endif /* CONFIG_PNFSD */
list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
list_add(&stp->st_perstateowner, &sop->so_stateids);
list_add(&stp->st_perfile, &fp->fi_stateids);
@@ -2227,6 +2314,7 @@ find_openstateowner_str(unsigned int has
{
struct nfs4_stateowner *so = NULL;
+ BUG_ON_UNLOCKED_STATE();
list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
if (same_owner_str(so, &open->op_owner, &open->op_clientid))
return so;
@@ -2235,7 +2323,7 @@ find_openstateowner_str(unsigned int has
}
/* search file_hashtbl[] for file */
-static struct nfs4_file *
+struct nfs4_file *
find_file(struct inode *ino)
{
unsigned int hashval = file_hashval(ino);
@@ -2253,6 +2341,18 @@ find_file(struct inode *ino)
return NULL;
}
+struct nfs4_file *
+find_alloc_file(struct inode *ino, struct svc_fh *current_fh)
+{
+ struct nfs4_file *fp;
+
+ fp = find_file(ino);
+ if (fp)
+ return fp;
+
+ return alloc_init_file(ino, current_fh);
+}
+
static inline int access_valid(u32 x, u32 minorversion)
{
if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
@@ -2787,7 +2887,7 @@ nfsd4_process_open2(struct svc_rqst *rqs
if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
goto out;
status = nfserr_resource;
- fp = alloc_init_file(ino);
+ fp = alloc_init_file(ino, current_fh);
if (fp == NULL)
goto out;
}
@@ -3006,7 +3106,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct
return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
}
-static int
+int
STALE_STATEID(stateid_t *stateid)
{
if (stateid->si_boot == boot_time)
@@ -3016,6 +3116,16 @@ STALE_STATEID(stateid_t *stateid)
return 1;
}
+__be32
+nfs4_check_stateid(stateid_t *stateid)
+{
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+ return nfserr_bad_stateid;
+ if (STALE_STATEID(stateid))
+ return nfserr_stale_stateid;
+ return 0;
+}
+
static inline int
access_permit_read(unsigned long access_bmap)
{
@@ -3127,6 +3237,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_
if (grace_disallows_io(ino))
return nfserr_grace;
+#if defined(CONFIG_PNFSD)
+ if (pnfs_fh_is_ds(&current_fh->fh_handle)) {
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+ status = nfserr_bad_stateid;
+ else
+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
+ {
+ dprintk("%s Don't check DS stateid\n", __func__);
+ return 0;
+ }
+#else /* CONFIG_GFS2_FS_LOCKING_DLM */
+ status = nfs4_preprocess_pnfs_ds_stateid(current_fh,
+ stateid);
+#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
+ goto out;
+ }
+#endif /* CONFIG_PNFSD */
+
if (nfsd4_has_session(cstate))
flags |= HAS_SESSION;
@@ -3214,13 +3342,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co
*stpp = NULL;
*sopp = NULL;
- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n");
- return nfserr_bad_stateid;
- }
-
- if (STALE_STATEID(stateid))
- return nfserr_stale_stateid;
+ status = nfs4_check_stateid(stateid);
+ if (status)
+ return status;
if (nfsd4_has_session(cstate))
flags |= HAS_SESSION;
@@ -3495,11 +3619,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp
if (nfsd4_has_session(cstate))
flags |= HAS_SESSION;
nfs4_lock_state();
- status = nfserr_bad_stateid;
- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
- goto out;
- status = nfserr_stale_stateid;
- if (STALE_STATEID(stateid))
+ status = nfs4_check_stateid(stateid);
+ if (status)
goto out;
status = nfserr_bad_stateid;
if (!is_delegation_stateid(stateid))
@@ -3529,26 +3650,6 @@ out:
#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS)
#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1)
-static inline u64
-end_offset(u64 start, u64 len)
-{
- u64 end;
-
- end = start + len;
- return end >= start ? end: NFS4_MAX_UINT64;
-}
-
-/* last octet in a range */
-static inline u64
-last_byte_offset(u64 start, u64 len)
-{
- u64 end;
-
- BUG_ON(!len);
- end = start + len;
- return end > start ? end - 1: NFS4_MAX_UINT64;
-}
-
#define lockownerid_hashval(id) \
((id) & LOCK_HASH_MASK)
@@ -3565,7 +3666,7 @@ static struct list_head lock_ownerid_has
static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
-static struct nfs4_stateid *
+struct nfs4_stateid *
find_stateid(stateid_t *stid, int flags)
{
struct nfs4_stateid *local;
@@ -3594,7 +3695,7 @@ find_stateid(stateid_t *stid, int flags)
return NULL;
}
-static struct nfs4_delegation *
+struct nfs4_delegation *
find_delegation_stateid(struct inode *ino, stateid_t *stid)
{
struct nfs4_file *fp;
@@ -3725,6 +3826,9 @@ alloc_init_lock_stateid(struct nfs4_stat
INIT_LIST_HEAD(&stp->st_perfile);
INIT_LIST_HEAD(&stp->st_perstateowner);
INIT_LIST_HEAD(&stp->st_lockowners); /* not used */
+#if defined(CONFIG_PNFSD)
+ INIT_LIST_HEAD(&stp->st_pnfs_ds_id);
+#endif /* CONFIG_PNFSD */
list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]);
list_add(&stp->st_perfile, &fp->fi_stateids);
list_add(&stp->st_perstateowner, &sop->so_stateids);
@@ -4301,6 +4405,9 @@ nfs4_state_init(void)
INIT_LIST_HEAD(&client_lru);
INIT_LIST_HEAD(&del_recall_lru);
reclaim_str_hashtbl_size = 0;
+#if defined(CONFIG_PNFSD)
+ nfs4_pnfs_state_init();
+#endif /* CONFIG_PNFSD */
return 0;
}
@@ -4405,6 +4512,7 @@ __nfs4_state_shutdown(void)
}
nfsd4_shutdown_recdir();
+ nfs4_pnfs_state_shutdown();
}
void
diff -up linux-2.6.38.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4xdr.c
--- linux-2.6.38.noarch/fs/nfsd/nfs4xdr.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/nfs4xdr.c 2011-03-26 07:57:44.294821141 -0400
@@ -45,11 +45,16 @@
#include <linux/statfs.h>
#include <linux/utsname.h>
#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/exportfs.h>
+#include <linux/nfsd/nfs4layoutxdr.h>
+#include <linux/nfsd4_spnfs.h>
+#include <linux/nfsd4_block.h>
#include "idmap.h"
#include "acl.h"
#include "xdr4.h"
#include "vfs.h"
+#include "pnfsd.h"
#define NFSDDBG_FACILITY NFSDDBG_XDR
@@ -1279,6 +1284,138 @@ static __be32 nfsd4_decode_reclaim_compl
DECODE_TAIL;
}
+#if defined(CONFIG_PNFSD)
+static __be32
+nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp,
+ struct nfsd4_pnfs_getdevlist *gdevl)
+{
+ DECODE_HEAD;
+
+ READ_BUF(16 + sizeof(nfs4_verifier));
+ READ32(gdevl->gd_layout_type);
+ READ32(gdevl->gd_maxdevices);
+ READ64(gdevl->gd_cookie);
+ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier));
+
+ DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp,
+ struct nfsd4_pnfs_getdevinfo *gdev)
+{
+ u32 num;
+ DECODE_HEAD;
+
+ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid));
+ READ64(gdev->gd_devid.sbid);
+ READ64(gdev->gd_devid.devid);
+ READ32(gdev->gd_layout_type);
+ READ32(gdev->gd_maxcount);
+ READ32(num);
+ if (num) {
+ READ_BUF(4);
+ READ32(gdev->gd_notify_types);
+ } else {
+ gdev->gd_notify_types = 0;
+ }
+
+ DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
+ struct nfsd4_pnfs_layoutget *lgp)
+{
+ DECODE_HEAD;
+
+ READ_BUF(36);
+ READ32(lgp->lg_signal);
+ READ32(lgp->lg_seg.layout_type);
+ READ32(lgp->lg_seg.iomode);
+ READ64(lgp->lg_seg.offset);
+ READ64(lgp->lg_seg.length);
+ READ64(lgp->lg_minlength);
+ nfsd4_decode_stateid(argp, &lgp->lg_sid);
+ READ_BUF(4);
+ READ32(lgp->lg_maxcount);
+
+ DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
+ struct nfsd4_pnfs_layoutcommit *lcp)
+{
+ DECODE_HEAD;
+ u32 timechange;
+
+ READ_BUF(20);
+ READ64(lcp->args.lc_seg.offset);
+ READ64(lcp->args.lc_seg.length);
+ READ32(lcp->args.lc_reclaim);
+ nfsd4_decode_stateid(argp, &lcp->lc_sid);
+ READ_BUF(4);
+ READ32(lcp->args.lc_newoffset);
+ if (lcp->args.lc_newoffset) {
+ READ_BUF(8);
+ READ64(lcp->args.lc_last_wr);
+ } else
+ lcp->args.lc_last_wr = 0;
+ READ_BUF(4);
+ READ32(timechange);
+ if (timechange) {
+ READ_BUF(12);
+ READ64(lcp->args.lc_mtime.seconds);
+ READ32(lcp->args.lc_mtime.nseconds);
+ } else {
+ lcp->args.lc_mtime.seconds = 0;
+ lcp->args.lc_mtime.nseconds = 0;
+ }
+ READ_BUF(8);
+ READ32(lcp->args.lc_seg.layout_type);
+ /* XXX: saving XDR'ed layout update. Since we don't have the
+ * current_fh yet, and therefore no export_ops, we can't call
+	 * the layout specific decode routines. File and PVFS2
+ * do not use the layout update....
+ */
+ READ32(lcp->args.lc_up_len);
+ if (lcp->args.lc_up_len > 0) {
+ READ_BUF(lcp->args.lc_up_len);
+ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len);
+ }
+
+ DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
+ struct nfsd4_pnfs_layoutreturn *lrp)
+{
+ DECODE_HEAD;
+
+ READ_BUF(16);
+ READ32(lrp->args.lr_reclaim);
+ READ32(lrp->args.lr_seg.layout_type);
+ READ32(lrp->args.lr_seg.iomode);
+ READ32(lrp->args.lr_return_type);
+ if (lrp->args.lr_return_type == RETURN_FILE) {
+ READ_BUF(16);
+ READ64(lrp->args.lr_seg.offset);
+ READ64(lrp->args.lr_seg.length);
+ nfsd4_decode_stateid(argp, &lrp->lr_sid);
+ READ_BUF(4);
+ READ32(lrp->args.lrf_body_len);
+ if (lrp->args.lrf_body_len > 0) {
+ READ_BUF(lrp->args.lrf_body_len);
+ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len);
+ }
+ }
+
+ DECODE_TAIL;
+}
+#endif /* CONFIG_PNFSD */
+
static __be32
nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
{
@@ -1380,11 +1517,19 @@ static nfsd4_dec nfsd41_dec_ops[] = {
[OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
[OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
+#if defined(CONFIG_PNFSD)
+ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo,
+ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist,
+ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
+ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
+ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
+#else /* CONFIG_PNFSD */
[OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
+#endif /* CONFIG_PNFSD */
[OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
[OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
[OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2191,6 +2336,36 @@ out_acl:
}
WRITE64(stat.ino);
}
+#if defined(CONFIG_PNFSD)
+ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
+ struct super_block *sb = dentry->d_inode->i_sb;
+ int type = 0;
+
+ /* Query the filesystem for supported pNFS layout types.
+ * Currently, we only support one layout type per file system.
+ * The export_ops->layout_type() returns the pnfs_layouttype4.
+ */
+ buflen -= 4;
+ if (buflen < 0) /* length */
+ goto out_resource;
+
+ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type)
+ type = sb->s_pnfs_op->layout_type(sb);
+ if (type) {
+ if ((buflen -= 4) < 0) /* type */
+ goto out_resource;
+ WRITE32(1); /* length */
+ WRITE32(type); /* type */
+ } else
+ WRITE32(0); /* length */
+ }
+
+ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+ if ((buflen -= 4) < 0)
+ goto out_resource;
+ WRITE32(stat.blksize);
+ }
+#endif /* CONFIG_PNFSD */
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
WRITE32(3);
WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
@@ -2434,6 +2609,10 @@ nfsd4_encode_commit(struct nfsd4_compoun
if (!nfserr) {
RESERVE_SPACE(8);
WRITEMEM(commit->co_verf.data, 8);
+ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n",
+ ((u32 *)(&commit->co_verf.data))[0],
+ ((u32 *)(&commit->co_verf.data))[1]);
+
ADJUST_ARGS();
}
return nfserr;
@@ -2688,6 +2867,13 @@ nfsd4_encode_read(struct nfsd4_compoundr
}
read->rd_vlen = v;
+#if defined(CONFIG_SPNFS)
+ if (spnfs_enabled())
+ nfserr = spnfs_read(read->rd_fhp->fh_dentry->d_inode,
+ read->rd_offset, &maxcount, read->rd_vlen,
+ resp->rqstp);
+ else /* we're not an MDS */
+#endif /* CONFIG_SPNFS */
nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
&maxcount);
@@ -3007,6 +3193,9 @@ nfsd4_encode_write(struct nfsd4_compound
WRITE32(write->wr_bytes_written);
WRITE32(write->wr_how_written);
WRITEMEM(write->wr_verifier.data, 8);
+ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n",
+ ((u32 *)(&write->wr_verifier.data))[0],
+ ((u32 *)(&write->wr_verifier.data))[1]);
ADJUST_ARGS();
}
return nfserr;
@@ -3146,6 +3335,343 @@ nfsd4_encode_sequence(struct nfsd4_compo
return 0;
}
+#if defined(CONFIG_PNFSD)
+
+/* Uses the export interface to iterate through the available devices
+ * and encodes them on the response stream.
+ */
+static __be32
+nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp,
+ struct nfsd4_pnfs_getdevlist *gdevl,
+ unsigned int *dev_count)
+{
+ struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb;
+ __be32 nfserr;
+ int status;
+ __be32 *p;
+ struct nfsd4_pnfs_dev_iter_res res = {
+ .gd_cookie = gdevl->gd_cookie,
+ .gd_verf = gdevl->gd_verf,
+ .gd_eof = 0
+ };
+ u64 sbid;
+
+ dprintk("%s: Begin\n", __func__);
+
+ sbid = find_create_sbid(sb);
+ *dev_count = 0;
+ do {
+ status = sb->s_pnfs_op->get_device_iter(sb,
+ gdevl->gd_layout_type,
+ &res);
+ if (status) {
+ if (status == -ENOENT) {
+ res.gd_eof = 1;
+ /* return success */
+ break;
+ }
+ nfserr = nfserrno(status);
+ goto out_err;
+ }
+
+ /* Encode device id and layout type */
+ RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid));
+ WRITE64((__be64)sbid);
+ WRITE64(res.gd_devid); /* devid minor */
+ ADJUST_ARGS();
+ (*dev_count)++;
+ } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof);
+ gdevl->gd_cookie = res.gd_cookie;
+ gdevl->gd_verf = res.gd_verf;
+ gdevl->gd_eof = res.gd_eof;
+ nfserr = nfs_ok;
+out_err:
+ dprintk("%s: Encoded %u devices\n", __func__, *dev_count);
+ return nfserr;
+}
+
+/* Encode the response of a GETDEVICELIST operation.
+ */
+static __be32
+nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_pnfs_getdevlist *gdevl)
+{
+ unsigned int dev_count = 0, lead_count;
+ u32 *p_in = resp->p;
+ __be32 *p;
+
+ dprintk("%s: err %d\n", __func__, nfserr);
+ if (nfserr)
+ return nfserr;
+
+ /* Ensure we have room for cookie, verifier, and devlist len,
+ * which we will backfill in after we encode as many devices as possible
+ */
+ lead_count = 8 + sizeof(nfs4_verifier) + 4;
+ RESERVE_SPACE(lead_count);
+ /* skip past these values */
+ p += XDR_QUADLEN(lead_count);
+ ADJUST_ARGS();
+
+ /* Iterate over as many device ids as possible on the xdr stream */
+ nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count);
+ if (nfserr)
+ goto out_err;
+
+ /* Backfill in cookie, verf and number of devices encoded */
+ p = p_in;
+ WRITE64(gdevl->gd_cookie);
+ WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier));
+ WRITE32(dev_count);
+
+ /* Skip over devices */
+ p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid));
+ ADJUST_ARGS();
+
+ /* are we at the end of devices? */
+ RESERVE_SPACE(4);
+ WRITE32(gdevl->gd_eof);
+ ADJUST_ARGS();
+
+ dprintk("%s: done.\n", __func__);
+
+ nfserr = nfs_ok;
+out:
+ return nfserr;
+out_err:
+ p = p_in;
+ ADJUST_ARGS();
+ goto out;
+}
+
+/* For a given device id, have the file system retrieve and encode the
+ * associated device. For file layout, the encoding function is
+ * passed down to the file system. The file system then has the option
+ * of using this encoding function or one of its own.
+ *
+ * Note: the file system must return the XDR size of struct device_addr4
+ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the
+ * gdir_mincount calculation.
+ */
+static __be32
+nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_pnfs_getdevinfo *gdev)
+{
+ struct super_block *sb;
+ int maxcount = 0, type_notify_len = 12;
+ __be32 *p, *p_save = NULL, *p_in = resp->p;
+ struct exp_xdr_stream xdr;
+
+ dprintk("%s: err %d\n", __func__, nfserr);
+ if (nfserr)
+ return nfserr;
+
+ sb = gdev->gd_sb;
+
+ if (gdev->gd_maxcount != 0) {
+ /* FIXME: this will be bound by the session max response */
+ maxcount = svc_max_payload(resp->rqstp);
+ if (maxcount > gdev->gd_maxcount)
+ maxcount = gdev->gd_maxcount;
+
+ /* Ensure have room for type and notify field */
+ maxcount -= type_notify_len;
+ if (maxcount < 0) {
+ nfserr = -ETOOSMALL;
+ goto toosmall;
+ }
+ }
+
+ RESERVE_SPACE(4);
+ WRITE32(gdev->gd_layout_type);
+ ADJUST_ARGS();
+
+ /* If maxcount is 0 then just update notifications */
+ if (gdev->gd_maxcount == 0)
+ goto handle_notifications;
+
+ xdr.p = p_save = resp->p;
+ xdr.end = resp->end;
+ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3))
+ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3);
+
+ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type,
+ &gdev->gd_devid);
+ if (nfserr)
+ goto err;
+
+ /* The file system should never write 0 bytes without
+ * returning an error
+ */
+ BUG_ON(xdr.p == p_save);
+ BUG_ON(xdr.p > xdr.end);
+
+ /* Update the xdr stream with the number of bytes encoded
+ * by the file system.
+ */
+ p = xdr.p;
+ ADJUST_ARGS();
+
+handle_notifications:
+ /* Encode supported device notifications */
+ RESERVE_SPACE(4);
+ if (sb->s_pnfs_op->set_device_notify) {
+ struct pnfs_devnotify_arg dn_args;
+
+ dn_args.dn_layout_type = gdev->gd_layout_type;
+ dn_args.dn_devid = gdev->gd_devid;
+ dn_args.dn_notify_types = gdev->gd_notify_types;
+ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args);
+ if (nfserr)
+ goto err;
+ WRITE32(dn_args.dn_notify_types);
+ } else {
+ WRITE32(0);
+ }
+ ADJUST_ARGS();
+
+out:
+ return nfserrno(nfserr);
+toosmall:
+ dprintk("%s: maxcount too small\n", __func__);
+ RESERVE_SPACE(4);
+ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len);
+ ADJUST_ARGS();
+ goto out;
+err:
+ /* Rewind to the beginning */
+ p = p_in;
+ ADJUST_ARGS();
+ if (nfserr == -ETOOSMALL)
+ goto toosmall;
+ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr);
+ goto out;
+}
+
+static __be32
+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp,
+ __be32 nfserr,
+ struct nfsd4_pnfs_layoutget *lgp)
+{
+ int maxcount, leadcount;
+ struct super_block *sb;
+ struct exp_xdr_stream xdr;
+ __be32 *p, *p_save, *p_start = resp->p;
+
+ dprintk("%s: err %d\n", __func__, nfserr);
+ if (nfserr)
+ return nfserr;
+
+ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb;
+ maxcount = PAGE_SIZE;
+ if (maxcount > lgp->lg_maxcount)
+ maxcount = lgp->lg_maxcount;
+
+ /* Check for space on xdr stream */
+ leadcount = 36 + sizeof(stateid_opaque_t);
+ RESERVE_SPACE(leadcount);
+ /* encode layout metadata after file system encodes layout */
+ p += XDR_QUADLEN(leadcount);
+ ADJUST_ARGS();
+
+ /* Ensure have room for ret_on_close, off, len, iomode, type */
+ maxcount -= leadcount;
+ if (maxcount < 0) {
+ printk(KERN_ERR "%s: buffer too small\n", __func__);
+ nfserr = nfserr_toosmall;
+ goto err;
+ }
+
+ /* Set xdr info so file system can encode layout */
+ xdr.p = p_save = resp->p;
+ xdr.end = resp->end;
+ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3))
+ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3);
+
+ /* Retrieve, encode, and merge layout; process stateid */
+ nfserr = nfs4_pnfs_get_layout(lgp, &xdr);
+ if (nfserr)
+ goto err;
+
+ /* Ensure file system returned enough bytes for the client
+ * to access.
+ */
+ if (lgp->lg_seg.length < lgp->lg_minlength) {
+ nfserr = nfserr_badlayout;
+ goto err;
+ }
+
+ /* The file system should never write 0 bytes without
+ * returning an error
+ */
+ BUG_ON(xdr.p == p_save);
+
+ /* Rewind to beginning and encode attrs */
+ resp->p = p_start;
+ RESERVE_SPACE(4);
+ WRITE32(lgp->lg_roc); /* return on close */
+ ADJUST_ARGS();
+ nfsd4_encode_stateid(resp, &lgp->lg_sid);
+ RESERVE_SPACE(28);
+ /* Note: response logr_layout array count, always one for now */
+ WRITE32(1);
+ WRITE64(lgp->lg_seg.offset);
+ WRITE64(lgp->lg_seg.length);
+ WRITE32(lgp->lg_seg.iomode);
+ WRITE32(lgp->lg_seg.layout_type);
+
+ /* Update the xdr stream with the number of bytes written
+ * by the file system
+ */
+ p = xdr.p;
+ ADJUST_ARGS();
+
+ return nfs_ok;
+err:
+ resp->p = p_start;
+ return nfserr;
+}
+
+static __be32
+nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_pnfs_layoutcommit *lcp)
+{
+ __be32 *p;
+
+ if (nfserr)
+ goto out;
+
+ RESERVE_SPACE(4);
+ WRITE32(lcp->res.lc_size_chg);
+ ADJUST_ARGS();
+ if (lcp->res.lc_size_chg) {
+ RESERVE_SPACE(8);
+ WRITE64(lcp->res.lc_newsize);
+ ADJUST_ARGS();
+ }
+out:
+ return nfserr;
+}
+
+static __be32
+nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_pnfs_layoutreturn *lrp)
+{
+ __be32 *p;
+
+ if (nfserr)
+ goto out;
+
+ RESERVE_SPACE(4);
+ WRITE32(lrp->lrs_present != 0); /* got stateid? */
+ ADJUST_ARGS();
+ if (lrp->lrs_present)
+ nfsd4_encode_stateid(resp, &lrp->lr_sid);
+out:
+ return nfserr;
+}
+#endif /* CONFIG_PNFSD */
+
static __be32
nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
{
@@ -3206,11 +3732,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
[OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
[OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
+#if defined(CONFIG_PNFSD)
+ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo,
+ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist,
+ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
+ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
+ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
+#else /* CONFIG_PNFSD */
[OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
[OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
+#endif /* CONFIG_PNFSD */
[OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
[OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
[OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
diff -up linux-2.6.38.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.38.noarch/fs/nfsd/nfsctl.c
--- linux-2.6.38.noarch/fs/nfsd/nfsctl.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/nfsctl.c 2011-03-26 07:57:44.295821132 -0400
@@ -12,11 +12,16 @@
#include <linux/nfsd/syscall.h>
#include <linux/lockd/lockd.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/nfsd/nfs4pnfsdlm.h>
#include "idmap.h"
#include "nfsd.h"
#include "cache.h"
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
+#include <linux/nfsd4_spnfs.h>
+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
+
/*
* We have a single directory with 9 nodes in it.
*/
@@ -51,6 +56,9 @@ enum {
NFSD_Gracetime,
NFSD_RecoveryDir,
#endif
+#ifdef CONFIG_PNFSD
+ NFSD_pnfs_dlm_device,
+#endif
};
/*
@@ -78,6 +86,9 @@ static ssize_t write_leasetime(struct fi
static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
#endif
+#ifdef CONFIG_PNFSD
+static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size);
+#endif
static ssize_t (*write_op[])(struct file *, char *, size_t) = {
#ifdef CONFIG_NFSD_DEPRECATED
@@ -102,6 +113,9 @@ static ssize_t (*write_op[])(struct file
[NFSD_Gracetime] = write_gracetime,
[NFSD_RecoveryDir] = write_recoverydir,
#endif
+#ifdef CONFIG_PNFSD
+ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device,
+#endif
};
static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
@@ -1366,6 +1380,68 @@ static ssize_t write_recoverydir(struct
#endif
+#ifdef CONFIG_PNFSD
+
+static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf,
+ size_t size)
+{
+ char *mesg = buf;
+ char *pnfs_dlm_device;
+ int max_size = NFSD_PNFS_DLM_DEVICE_MAX;
+ int len, ret = 0;
+
+ if (size > 0) {
+ ret = -EINVAL;
+ if (size > max_size || buf[size-1] != '\n')
+ return ret;
+ buf[size-1] = 0;
+
+ pnfs_dlm_device = mesg;
+ len = qword_get(&mesg, pnfs_dlm_device, size);
+ if (len <= 0)
+ return ret;
+
+ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len);
+ } else
+ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT);
+
+ return ret <= 0 ? ret : strlen(buf);
+}
+
+/**
+ * write_pnfs_dlm_device - Set or report the current pNFS data server list
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing a block device name,
+ * a colon, and then a comma separated
+ * list of pNFS data server IPv4 addresses
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C
+ * string containing a block device name, a colon, and
+ * then a comma separated list of pNFS
+ * data server IPv4 addresses.
+ * return code is the size in bytes of the string
+ * On error: return code is a negative errno value
+ */
+static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size)
+{
+ ssize_t rv;
+
+ mutex_lock(&nfsd_mutex);
+ rv = __write_pnfs_dlm_device(file, buf, size);
+ mutex_unlock(&nfsd_mutex);
+ return rv;
+}
+
+#endif /* CONFIG_PNFSD */
+
/*----------------------------------------------------------------------------*/
/*
* populating the filesystem.
@@ -1402,6 +1478,10 @@ static int nfsd_fill_super(struct super_
[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
#endif
+#ifdef CONFIG_PNFSD
+ [NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops,
+ S_IWUSR|S_IRUSR},
+#endif
/* last one */ {""}
};
return simple_fill_super(sb, 0x6e667364, nfsd_files);
@@ -1440,6 +1520,9 @@ static int create_proc_exports_entry(voi
}
#endif
+#if defined(CONFIG_SPNFS_BLOCK)
+int nfsd_bl_init(void);
+#endif
static int __init init_nfsd(void)
{
int retval;
@@ -1462,6 +1545,15 @@ static int __init init_nfsd(void)
retval = create_proc_exports_entry();
if (retval)
goto out_free_idmap;
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
+ retval = spnfs_init_proc();
+ if (retval != 0)
+ goto out_free_idmap;
+#if defined(CONFIG_SPNFS_BLOCK)
+ nfsd_bl_init();
+#endif /* CONFIG_SPNFS_BLOCK */
+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
+
retval = register_filesystem(&nfsd_fs_type);
if (retval)
goto out_free_all;
@@ -1484,7 +1576,22 @@ out_free_stat:
static void __exit exit_nfsd(void)
{
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
+ remove_proc_entry("fs/nfs/spnfs/recall", NULL);
+ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL);
+ remove_proc_entry("fs/nfs/spnfs/getfh", NULL);
+ remove_proc_entry("fs/nfs/spnfs/config", NULL);
+ remove_proc_entry("fs/nfs/spnfs/ctl", NULL);
+ remove_proc_entry("fs/nfs/spnfs", NULL);
+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
+
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
+ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL);
+ remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL);
+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */
+
nfsd_export_shutdown();
+ nfsd4_pnfs_dlm_shutdown();
nfsd_reply_cache_shutdown();
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
diff -up linux-2.6.38.noarch/fs/nfsd/nfsd.h.orig linux-2.6.38.noarch/fs/nfsd/nfsd.h
--- linux-2.6.38.noarch/fs/nfsd/nfsd.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/nfsd.h 2011-03-26 07:57:44.296821123 -0400
@@ -287,11 +287,22 @@ extern time_t nfsd4_grace;
#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
NFSD4_SUPPORTED_ATTRS_WORD0
+#if defined(CONFIG_PNFSD)
+#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
+ (NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES)
+#else /* CONFIG_PNFSD */
#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
NFSD4_SUPPORTED_ATTRS_WORD1
+#endif /* CONFIG_PNFSD */
+#if defined(CONFIG_PNFSD)
+#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
+ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \
+ FATTR4_WORD2_LAYOUT_BLKSIZE)
+#else /* CONFIG_PNFSD */
#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+#endif /* CONFIG_PNFSD */
static inline u32 nfsd_suppattrs0(u32 minorversion)
{
diff -up linux-2.6.38.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.38.noarch/fs/nfsd/nfsfh.c
--- linux-2.6.38.noarch/fs/nfsd/nfsfh.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/nfsfh.c 2011-03-26 07:57:44.297821114 -0400
@@ -10,6 +10,7 @@
#include <linux/exportfs.h>
#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/nfsd/nfsd4_pnfs.h>
#include "nfsd.h"
#include "vfs.h"
#include "auth.h"
@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s
static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
{
struct knfsd_fh *fh = &fhp->fh_handle;
+ int fsid_type;
struct fid *fid = NULL, sfid;
struct svc_export *exp;
struct dentry *dentry;
@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct
return error;
if (fh->fh_auth_type != 0)
return error;
- len = key_len(fh->fh_fsid_type) / 4;
+ fsid_type = pnfs_fh_fsid_type(fh);
+ len = key_len(fsid_type) / 4;
if (len == 0)
return error;
if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct
data_left -= len;
if (data_left < 0)
return error;
- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth);
+ exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth);
fid = (struct fid *)(fh->fh_auth + len);
} else {
__u32 tfh[2];
diff -up linux-2.6.38.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.38.noarch/fs/nfsd/nfsfh.h
--- linux-2.6.38.noarch/fs/nfsd/nfsfh.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/nfsfh.h 2011-03-26 07:57:44.298821106 -0400
@@ -14,6 +14,7 @@ enum nfsd_fsid {
FSID_UUID8,
FSID_UUID16,
FSID_UUID16_INUM,
+ FSID_MAX
};
enum fsid_source {
@@ -203,4 +204,42 @@ fh_unlock(struct svc_fh *fhp)
}
}
+#if defined(CONFIG_PNFSD)
+
+/*
+ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied
+ * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how
+ * to handle a given stateid.
+ */
+static inline int pnfs_fh_is_ds(struct knfsd_fh *fh)
+{
+ return fh->fh_fsid_type >= FSID_MAX;
+}
+
+static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh)
+{
+ BUG_ON(fh->fh_version != 1);
+ BUG_ON(pnfs_fh_is_ds(fh));
+ fh->fh_fsid_type += FSID_MAX;
+}
+
+#else /* CONFIG_PNFSD */
+
+static inline int pnfs_fh_is_ds(struct knfsd_fh *fh)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PNFSD */
+
+/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). */
+static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh)
+{
+ int fsid_type = fh->fh_fsid_type;
+
+ if (pnfs_fh_is_ds(fh))
+ return fsid_type - FSID_MAX;
+ return fsid_type;
+}
+
#endif /* _LINUX_NFSD_FH_INT_H */
diff -up linux-2.6.38.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.38.noarch/fs/nfsd/nfssvc.c
--- linux-2.6.38.noarch/fs/nfsd/nfssvc.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/nfssvc.c 2011-03-26 07:57:44.298821106 -0400
@@ -116,7 +116,7 @@ struct svc_program nfsd_program = {
};
-u32 nfsd_supported_minorversion;
+u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION;
int nfsd_vers(int vers, enum vers_op change)
{
diff -up linux-2.6.38.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.38.noarch/fs/nfsd/pnfsd.h
--- linux-2.6.38.noarch/fs/nfsd/pnfsd.h.orig 2011-03-26 07:57:44.299821098 -0400
+++ linux-2.6.38.noarch/fs/nfsd/pnfsd.h 2011-03-26 07:57:44.299821098 -0400
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2005 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef LINUX_NFSD_PNFSD_H
+#define LINUX_NFSD_PNFSD_H
+
+#include <linux/list.h>
+#include <linux/nfsd/nfsd4_pnfs.h>
+
+#include "state.h"
+#include "xdr4.h"
+
+/* outstanding layout stateid */
+struct nfs4_layout_state {
+ struct list_head ls_perfile;
+ struct list_head ls_layouts; /* list of nfs4_layouts */
+ struct kref ls_ref;
+ struct nfs4_client *ls_client;
+ struct nfs4_file *ls_file;
+ stateid_t ls_stateid;
+};
+
+/* outstanding layout */
+struct nfs4_layout {
+ struct list_head lo_perfile; /* hash by f_id */
+ struct list_head lo_perclnt; /* hash by clientid */
+ struct list_head lo_perstate;
+ struct nfs4_file *lo_file; /* backpointer */
+ struct nfs4_client *lo_client;
+ struct nfs4_layout_state *lo_state;
+ struct nfsd4_layout_seg lo_seg;
+};
+
+struct pnfs_inval_state {
+ struct knfsd_fh mdsfh; /* needed only by invalidate all */
+ stateid_t stid;
+ clientid_t clid;
+ u32 status;
+};
+
+/* pNFS Data Server state */
+#define DS_STATEID_VALID 0
+#define DS_STATEID_ERROR 1
+#define DS_STATEID_NEW 2
+
+struct pnfs_ds_stateid {
+ struct list_head ds_hash; /* ds_stateid hash entry */
+ struct list_head ds_perclid; /* per client hash entry */
+ stateid_t ds_stid;
+ struct knfsd_fh ds_fh;
+ unsigned long ds_access;
+ u32 ds_status; /* from MDS */
+ u32 ds_verifier[2]; /* from MDS */
+ wait_queue_head_t ds_waitq;
+ unsigned long ds_flags;
+ struct kref ds_ref;
+ clientid_t ds_mdsclid;
+};
+
+struct pnfs_ds_clientid {
+ struct list_head dc_hash; /* mds_clid_hashtbl entry */
+ struct list_head dc_stateid; /* ds_stateid head */
+ struct list_head dc_permdsid; /* per mdsid hash entry */
+ clientid_t dc_mdsclid;
+ struct kref dc_ref;
+ uint32_t dc_mdsid;
+};
+
+struct pnfs_mds_id {
+ struct list_head di_hash; /* mds_nodeid list entry */
+ struct list_head di_mdsclid; /* mds_clientid head */
+ uint32_t di_mdsid;
+ time_t di_mdsboot; /* mds boot time */
+ struct kref di_ref;
+};
+
+/* notify device request (from exported filesystem) */
+struct nfs4_notify_device {
+ struct nfsd4_pnfs_cb_dev_list *nd_list;
+ struct nfs4_client *nd_client;
+ struct list_head nd_perclnt;
+
+ /* nfsd internal */
+ struct nfsd4_callback nd_recall;
+};
+
+u64 find_create_sbid(struct super_block *);
+struct super_block *find_sbid_id(u64);
+__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *);
+int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *,
+ struct nfsd4_pnfs_layoutreturn *);
+int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *);
+int nfs4_pnfs_cb_change_state(struct pnfs_get_state *);
+void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *);
+int put_layoutrecall(struct nfs4_layoutrecall *);
+void nomatching_layout(struct nfs4_layoutrecall *);
+void *layoutrecall_done(struct nfs4_layoutrecall *);
+void nfsd4_cb_layout(struct nfs4_layoutrecall *);
+int nfsd_layout_recall_cb(struct super_block *, struct inode *,
+ struct nfsd4_pnfs_cb_layout *);
+int nfsd_device_notify_cb(struct super_block *,
+ struct nfsd4_pnfs_cb_dev_list *);
+void nfsd4_cb_notify_device(struct nfs4_notify_device *);
+void pnfs_set_device_notify(clientid_t *, unsigned int types);
+void pnfs_clear_device_notify(struct nfs4_client *);
+
+#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
+extern struct sockaddr pnfsd_lexp_addr;
+extern size_t pnfs_lexp_addr_len;
+
+extern void pnfsd_lexp_init(struct inode *);
+extern bool is_inode_pnfsd_lexp(struct inode *);
+extern int pnfsd_lexp_recall_layout(struct inode *);
+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
+
+#endif /* LINUX_NFSD_PNFSD_H */
diff -up linux-2.6.38.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.38.noarch/fs/nfsd/pnfsd_lexp.c
--- linux-2.6.38.noarch/fs/nfsd/pnfsd_lexp.c.orig 2011-03-26 07:57:44.300821090 -0400
+++ linux-2.6.38.noarch/fs/nfsd/pnfsd_lexp.c 2011-03-26 07:57:44.300821090 -0400
@@ -0,0 +1,296 @@
+/*
+ * linux/fs/nfsd/pnfsd_lexp.c
+ *
+ * pNFS export of local filesystems.
+ *
+ * Export local file systems over the files layout type.
+ * The MDS (metadata server) functions also as a single DS (data server).
+ * This is mostly useful for development and debugging purposes.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) 2008 Benny Halevy, <bhalevy@panasas.com>
+ *
+ * Initial implementation was based on the pnfs-gfs2 patches done
+ * by David M. Richter <richterd@citi.umich.edu>
+ */
+
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/nfsd/nfs4layoutxdr.h>
+
+#include "pnfsd.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+struct sockaddr pnfsd_lexp_addr;
+size_t pnfs_lexp_addr_len;
+
+static wait_queue_head_t lo_recall_wq;
+
+static int
+pnfsd_lexp_layout_type(struct super_block *sb)
+{
+ int ret = LAYOUT_NFSV4_1_FILES;
+ dprintk("<-- %s: return %d\n", __func__, ret);
+ return ret;
+}
+
+static int
+pnfsd_lexp_get_device_iter(struct super_block *sb,
+ u32 layout_type,
+ struct nfsd4_pnfs_dev_iter_res *res)
+{
+ dprintk("--> %s: sb=%p\n", __func__, sb);
+
+ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES);
+
+ res->gd_eof = 1;
+ if (res->gd_cookie)
+ return -ENOENT;
+ res->gd_cookie = 1;
+ res->gd_verf = 1;
+ res->gd_devid = 1;
+
+ dprintk("<-- %s: return 0\n", __func__);
+ return 0;
+}
+
+static int
+pnfsd_lexp_get_device_info(struct super_block *sb,
+ struct exp_xdr_stream *xdr,
+ u32 layout_type,
+ const struct nfsd4_pnfs_deviceid *devid)
+{
+ int err;
+ struct pnfs_filelayout_device fdev;
+ struct pnfs_filelayout_multipath fl_devices[1];
+ u32 fl_stripe_indices[1] = { 0 };
+ struct pnfs_filelayout_devaddr daddr;
+ /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */
+ char daddr_buf[8*4 + 2*3 + 10];
+
+ dprintk("--> %s: sb=%p\n", __func__, sb);
+
+ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES);
+
+ memset(&fdev, '\0', sizeof(fdev));
+
+ if (devid->devid != 1) {
+ printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 "
+ "(got: 0x%llx)\n", __func__, devid->devid);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* count the number of comma-delimited DS IPs */
+ fdev.fl_device_length = 1;
+ fdev.fl_device_list = fl_devices;
+
+ fdev.fl_stripeindices_length = fdev.fl_device_length;
+ fdev.fl_stripeindices_list = fl_stripe_indices;
+
+ daddr.r_addr.data = daddr_buf;
+ daddr.r_addr.len = sizeof(daddr_buf);
+ err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr);
+ if (err < 0)
+ goto out;
+ daddr.r_addr.len = err;
+ switch (pnfsd_lexp_addr.sa_family) {
+ case AF_INET:
+ daddr.r_netid.data = "tcp";
+ daddr.r_netid.len = 3;
+ break;
+ case AF_INET6:
+ daddr.r_netid.data = "tcp6";
+ daddr.r_netid.len = 4;
+ break;
+ default:
+ BUG();
+ }
+ fdev.fl_device_list[0].fl_multipath_length = 1;
+ fdev.fl_device_list[0].fl_multipath_list = &daddr;
+
+ /* have nfsd encode the device info */
+ err = filelayout_encode_devinfo(xdr, &fdev);
+out:
+ dprintk("<-- %s: return %d\n", __func__, err);
+ return err;
+}
+
+static int get_stripe_unit(int blocksize)
+{
+ if (blocksize < NFSSVC_MAXBLKSIZE)
+ blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
+ dprintk("%s: return %d\n", __func__, blocksize);
+ return blocksize;
+}
+
+static enum nfsstat4
+pnfsd_lexp_layout_get(struct inode *inode,
+ struct exp_xdr_stream *xdr,
+ const struct nfsd4_pnfs_layoutget_arg *arg,
+ struct nfsd4_pnfs_layoutget_res *res)
+{
+ enum nfsstat4 rc = NFS4_OK;
+ struct pnfs_filelayout_layout *layout = NULL;
+ struct knfsd_fh *fhp = NULL;
+
+ dprintk("--> %s: inode=%p\n", __func__, inode);
+
+ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
+ res->lg_seg.offset = 0;
+ res->lg_seg.length = NFS4_MAX_UINT64;
+
+ layout = kzalloc(sizeof(*layout), GFP_KERNEL);
+ if (layout == NULL) {
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ /* Set file layout response args */
+ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
+ layout->lg_stripe_type = STRIPE_SPARSE;
+ layout->lg_commit_through_mds = true;
+ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
+ layout->lg_fh_length = 1;
+ layout->device_id.sbid = arg->lg_sbid;
+ layout->device_id.devid = 1; /*FSFTEMP*/
+ layout->lg_first_stripe_index = 0; /*FSFTEMP*/
+ layout->lg_pattern_offset = 0;
+
+ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
+ if (fhp == NULL) {
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ memcpy(fhp, arg->lg_fh, sizeof(*fhp));
+ pnfs_fh_mark_ds(fhp);
+ layout->lg_fh_list = fhp;
+
+ /* Call nfsd to encode layout */
+ rc = filelayout_encode_layout(xdr, layout);
+exit:
+ kfree(layout);
+ kfree(fhp);
+ dprintk("<-- %s: return %d\n", __func__, rc);
+ return rc;
+
+error:
+ res->lg_seg.length = 0;
+ goto exit;
+}
+
+static int
+pnfsd_lexp_layout_commit(struct inode *inode,
+ const struct nfsd4_pnfs_layoutcommit_arg *args,
+ struct nfsd4_pnfs_layoutcommit_res *res)
+{
+ dprintk("%s: (unimplemented)\n", __func__);
+
+ return 0;
+}
+
+static int
+pnfsd_lexp_layout_return(struct inode *inode,
+ const struct nfsd4_pnfs_layoutreturn_arg *args)
+{
+ wake_up_all(&lo_recall_wq);
+ return 0;
+}
+
+static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh,
+ struct pnfs_get_state *p)
+{
+ return 0; /* just use the current stateid */
+}
+
+static struct pnfs_export_operations pnfsd_lexp_ops = {
+ .layout_type = pnfsd_lexp_layout_type,
+ .get_device_info = pnfsd_lexp_get_device_info,
+ .get_device_iter = pnfsd_lexp_get_device_iter,
+ .layout_get = pnfsd_lexp_layout_get,
+ .layout_commit = pnfsd_lexp_layout_commit,
+ .layout_return = pnfsd_lexp_layout_return,
+ .get_state = pnfsd_lexp_get_state,
+};
+
+void
+pnfsd_lexp_init(struct inode *inode)
+{
+	static bool init_once;
+	dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops);
+	inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops;
+	/* don't apply ++ to a bool; set it explicitly on first call */
+	if (!init_once) {
+		init_once = true;
+		init_waitqueue_head(&lo_recall_wq);
+	}
+}
+
+bool
+is_inode_pnfsd_lexp(struct inode *inode)
+{
+ return inode->i_sb->s_pnfs_op == &pnfsd_lexp_ops;
+}
+
+static bool
+has_layout(struct nfs4_file *fp)
+{
+ return !list_empty(&fp->fi_layouts);
+}
+
+/*
+ * recalls the layout if needed and waits synchronously for its return
+ */
+int
+pnfsd_lexp_recall_layout(struct inode *inode)
+{
+ struct nfs4_file *fp;
+ struct nfsd4_pnfs_cb_layout cbl;
+ int status = 0;
+
+ dprintk("%s: begin\n", __func__);
+ fp = find_file(inode);
+ BUG_ON(!fp);
+
+ if (!has_layout(fp))
+ goto out;
+
+ memset(&cbl, 0, sizeof(cbl));
+ cbl.cbl_recall_type = RETURN_FILE;
+ cbl.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES;
+ /* for now, always recall the whole layout */
+ cbl.cbl_seg.iomode = IOMODE_ANY;
+ cbl.cbl_seg.offset = 0;
+ cbl.cbl_seg.length = NFS4_MAX_UINT64;
+
+ while (has_layout(fp)) {
+ dprintk("%s: recalling layout\n", __func__);
+ status = nfsd_layout_recall_cb(inode->i_sb, inode, &cbl);
+
+ switch (status) {
+ case 0:
+ case -EAGAIN:
+ break;
+ case -ENOENT: /* no matching layout */
+ status = 0;
+ goto out;
+ default:
+ goto out;
+ }
+
+ dprintk("%s: waiting status=%d\n", __func__, status);
+ status = wait_event_interruptible(lo_recall_wq, !has_layout(fp));
+ if (status)
+ break;
+ }
+out:
+ put_nfs4_file(fp);
+ dprintk("%s: status=%d\n", __func__, status);
+ return status;
+}
diff -up linux-2.6.38.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.38.noarch/fs/nfsd/spnfs_com.c
--- linux-2.6.38.noarch/fs/nfsd/spnfs_com.c.orig 2011-03-26 07:57:44.301821082 -0400
+++ linux-2.6.38.noarch/fs/nfsd/spnfs_com.c 2011-03-26 07:57:44.301821082 -0400
@@ -0,0 +1,535 @@
+/*
+ * fs/nfsd/spnfs_com.c
+ *
+ * Communication layer between spNFS kernel and userspace
+ * Based heavily on idmap.c
+ *
+ */
+
+/*
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/nfsd/debug.h>
+
+#include <linux/nfsd4_spnfs.h>
+
+#define NFSDDBG_FACILITY NFSDDBG_PROC
+
+static ssize_t spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *,
+ char __user *, size_t);
+static ssize_t spnfs_pipe_downcall(struct file *, const char __user *,
+ size_t);
+static void spnfs_pipe_destroy_msg(struct rpc_pipe_msg *);
+
+static struct rpc_pipe_ops spnfs_upcall_ops = {
+ .upcall = spnfs_pipe_upcall,
+ .downcall = spnfs_pipe_downcall,
+ .destroy_msg = spnfs_pipe_destroy_msg,
+};
+
+/* evil global variable */
+struct spnfs *global_spnfs;
+struct spnfs_config *spnfs_config;
+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
+int spnfs_use_layoutsegments;
+uint64_t layoutsegment_size;
+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
+
+/*
+ * Used by spnfs_enabled()
+ * Tracks if the subsystem has been initialized at some point. It doesn't
+ * matter if it's not currently initialized.
+ */
+static int spnfs_enabled_at_some_point;
+
+/* call this to start the ball rolling */
+/* code it like we're going to avoid the global variable in the future */
+int
+nfsd_spnfs_new(void)
+{
+ struct spnfs *spnfs = NULL;
+ struct path path;
+ struct nameidata nd;
+ int rc;
+
+ if (global_spnfs != NULL)
+ return -EEXIST;
+
+ path.mnt = rpc_get_mount();
+ if (IS_ERR(path.mnt))
+ return PTR_ERR(path.mnt);
+
+ /* FIXME: do not abuse rpc_pipefs/nfs */
+ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
+ if (rc)
+ goto err;
+
+ spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL);
+ if (spnfs == NULL){
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs,
+ &spnfs_upcall_ops, 0);
+ if (IS_ERR(spnfs->spnfs_dentry)) {
+ rc = -EPIPE;
+ goto err;
+ }
+
+ mutex_init(&spnfs->spnfs_lock);
+ mutex_init(&spnfs->spnfs_plock);
+ init_waitqueue_head(&spnfs->spnfs_wq);
+
+ global_spnfs = spnfs;
+ spnfs_enabled_at_some_point = 1;
+
+ return 0;
+err:
+ rpc_put_mount();
+ kfree(spnfs);
+ return rc;
+}
+
+/* again, code it like we're going to remove the global variable */
+void
+nfsd_spnfs_delete(void)
+{
+ struct spnfs *spnfs = global_spnfs;
+
+ if (!spnfs)
+ return;
+ rpc_unlink(spnfs->spnfs_dentry);
+ rpc_put_mount();
+ global_spnfs = NULL;
+ kfree(spnfs);
+}
+
+/* RPC pipefs upcall/downcall routines */
+/* looks like this code is invoked by the rpc_pipe code */
+/* to handle upcalls on things we've queued elsewhere */
+/* See nfs_idmap_id for an example of enqueueing */
+static ssize_t
+spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+		char __user *dst, size_t buflen)
+{
+	char *data = (char *)msg->data + msg->copied;
+	ssize_t mlen = msg->len - msg->copied;
+	ssize_t left;
+
+	if (mlen > buflen)
+		mlen = buflen;
+
+	/* copy_to_user() returns the number of bytes NOT copied (>= 0) */
+	left = copy_to_user(dst, data, mlen);
+	if (left == mlen) {
+		msg->errno = -EFAULT;
+		return -EFAULT;
+	}
+	mlen -= left;
+	msg->copied += mlen;
+	msg->errno = 0;
+	return mlen;
+}
+
+static ssize_t
+spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
+	struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
+	struct spnfs *spnfs = (struct spnfs *)rpci->private;
+	struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im;
+	int ret;
+
+	if (mlen != sizeof(struct spnfs_msg))
+		return -ENOSPC;
+
+	im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
+	if (im_in == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(im_in, src, mlen) != 0) {
+		kfree(im_in);	/* don't leak im_in on a faulting copy */
+		return -EFAULT;
+	}
+
+	mutex_lock(&spnfs->spnfs_plock);
+
+	ret = mlen;
+	im->im_status = im_in->im_status;
+	/* If we got an error, terminate now, and wake up pending upcalls */
+	if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) {
+		wake_up(&spnfs->spnfs_wq);
+		goto out;
+	}
+
+	ret = -EINVAL;
+	/* Did we match the current upcall? */
+	/* DMXXX: do not understand the comment above, from original code */
+	/* DMXXX: when do we _not_ match the current upcall? */
+	/* DMXXX: anyway, let's do a simplistic check */
+	if (im_in->im_type == im->im_type) {
+		/* copy the response into the spnfs struct */
+		memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res));
+		ret = mlen;
+	} else
+		dprintk("spnfs: downcall type != upcall type\n");
+
+	wake_up(&spnfs->spnfs_wq);
+/* DMXXX handle rval processing */
+out:
+	mutex_unlock(&spnfs->spnfs_plock);
+	kfree(im_in);
+	return ret;
+}
+
+static void
+spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+ struct spnfs_msg *im = msg->data;
+ struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im);
+
+ if (msg->errno >= 0)
+ return;
+ mutex_lock(&spnfs->spnfs_plock);
+ im->im_status = SPNFS_STATUS_FAIL; /* DMXXX */
+ wake_up(&spnfs->spnfs_wq);
+ mutex_unlock(&spnfs->spnfs_plock);
+}
+
+/* generic upcall. called by functions in spnfs_ops.c */
+int
+spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg,
+ union spnfs_msg_res *res)
+{
+ struct rpc_pipe_msg msg;
+ struct spnfs_msg *im;
+ DECLARE_WAITQUEUE(wq, current);
+ int ret = -EIO;
+ int rval;
+
+ im = &spnfs->spnfs_im;
+
+ mutex_lock(&spnfs->spnfs_lock);
+ mutex_lock(&spnfs->spnfs_plock);
+
+ memset(im, 0, sizeof(*im));
+ memcpy(im, upmsg, sizeof(*upmsg));
+
+ memset(&msg, 0, sizeof(msg));
+ msg.data = im;
+ msg.len = sizeof(*im);
+
+ add_wait_queue(&spnfs->spnfs_wq, &wq);
+ rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg);
+ if (rval < 0) {
+ remove_wait_queue(&spnfs->spnfs_wq, &wq);
+ goto out;
+ }
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&spnfs->spnfs_plock);
+ schedule();
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&spnfs->spnfs_wq, &wq);
+ mutex_lock(&spnfs->spnfs_plock);
+
+ if (im->im_status & SPNFS_STATUS_SUCCESS) {
+ /* copy our result from the upcall */
+ memcpy(res, &im->im_res, sizeof(*res));
+ ret = 0;
+ }
+
+out:
+ memset(im, 0, sizeof(*im));
+ mutex_unlock(&spnfs->spnfs_plock);
+ mutex_unlock(&spnfs->spnfs_lock);
+ return(ret);
+}
+
+/*
+ * This is used to determine if the spnfsd daemon has been started at
+ * least once since the system came up.  This is used by the export
+ * mechanism to decide if spnfs is in use.
+ *
+ * Returns non-zero if the spnfsd has initialized the communication pipe
+ * at least once.
+ */
+int spnfs_enabled(void)
+{
+ return spnfs_enabled_at_some_point;
+}
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * procfs virtual files for user/kernel space communication:
+ *
+ * ctl - currently just an on/off switch...can be expanded
+ * getfh - fd to fh conversion
+ * recall - recall a layout from the command line, for example:
+ * echo <path> > /proc/fs/spnfs/recall
+ * config - configuration info, e.g., stripe size, num ds, etc.
+ */
+
+/*-------------- start ctl -------------------------*/
+static ssize_t ctl_write(struct file *file, const char __user *buf,
+			 size_t count, loff_t *offset)
+{
+	int cmd, rc;
+
+	/* never read past the user buffer: require a whole int */
+	if (count < sizeof(int) || copy_from_user(&cmd, buf, sizeof(int)))
+		return -EFAULT;
+	if (cmd) {
+		rc = nfsd_spnfs_new();
+		if (rc != 0)
+			return rc;
+	} else
+		nfsd_spnfs_delete();
+	return count;
+}
+
+static const struct file_operations ctl_ops = {
+ .write = ctl_write,
+};
+/*-------------- end ctl ---------------------------*/
+
+/*-------------- start config -------------------------*/
+static ssize_t config_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *offset)
+{
+	static struct spnfs_config cfg;
+
+	/* bound the copy: an oversized count would overflow cfg */
+	if (count > sizeof(cfg) || copy_from_user(&cfg, buf, count))
+		return -EFAULT;
+	spnfs_config = &cfg;
+	return count;	/* returning 0 would make userspace write() loop */
+}
+
+static const struct file_operations config_ops = {
+ .write = config_write,
+};
+/*-------------- end config ---------------------------*/
+
+/*-------------- start getfh -----------------------*/
+static int getfh_open(struct inode *inode, struct file *file)
+{
+ file->private_data = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
+ if (file->private_data == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static ssize_t getfh_read(struct file *file, char __user *buf, size_t count,
+			  loff_t *offset)
+{
+	/* never write past the user buffer: require room for a whole fh */
+	if (count < sizeof(struct nfs_fh) ||
+	    copy_to_user(buf, file->private_data, sizeof(struct nfs_fh)))
+		return -EFAULT;
+	return count;
+}
+
+static ssize_t getfh_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *offset)
+{
+	int fd;
+
+	/* never read past the user buffer: require a whole int */
+	if (count < sizeof(int) || copy_from_user(&fd, buf, sizeof(int)))
+		return -EFAULT;
+	if (spnfs_getfh(fd, file->private_data) != 0)
+		return -EIO;
+	return count;
+}
+
+static int getfh_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static const struct file_operations getfh_ops = {
+ .open = getfh_open,
+ .read = getfh_read,
+ .write = getfh_write,
+ .release = getfh_release,
+};
+/*-------------- end getfh ------------------------*/
+
+
+/*-------------- start recall layout --------------*/
+static ssize_t recall_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offset)
+{
+ char input[128];
+ char *path, *str, *p;
+ int rc;
+ u64 off = 0, len = 0;
+
+ if (count > 128)
+ return -EINVAL;
+
+ if (copy_from_user(input, buf, count))
+ return -EFAULT;
+
+ /* assumes newline-terminated path */
+ p = memchr(input, '\n', count);
+ if (p == NULL)
+ return -EINVAL;
+ *p = '\0';
+
+ /*
+ * Scan for path and, optionally, an offset and length
+ * of a layout segment to be recalled; if there are two
+ * fields, they're assumed to be path and offset.
+ */
+ p = input;
+ path = strsep(&p, " ");
+ if (path == NULL)
+ return -EINVAL;
+
+ str = strsep(&p, " ");
+ if (str != NULL) {
+ rc = strict_strtoull(str, 10, &off);
+ if (rc != 0)
+ return -EINVAL;
+
+ str = strsep(&p, " ");
+ if (str != NULL) {
+ rc = strict_strtoull(str, 10, &len);
+ if (rc != 0)
+ return -EINVAL;
+ }
+ }
+
+ rc = spnfs_test_layoutrecall(path, off, len);
+ if (rc != 0)
+ return rc;
+
+ return count;
+}
+
+static const struct file_operations recall_ops = {
+ .write = recall_write,
+};
+/*-------------- end recall layout --------------*/
+
+
+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
+/*-------------- start layoutseg -------------------------*/
+static ssize_t layoutseg_write(struct file *file, const char __user *buf,
+			       size_t count, loff_t *offset)
+{
+	char cmd[3];
+
+	/* don't read a byte the caller didn't supply */
+	if (count < 1 || copy_from_user(cmd, buf, 1))
+		return -EFAULT;
+	if (cmd[0] == '0')
+		spnfs_use_layoutsegments = 0;
+	else
+		spnfs_use_layoutsegments = 1;
+	return count;
+}
+
+static const struct file_operations layoutseg_ops = {
+ .write = layoutseg_write,
+};
+/*-------------- end layoutseg ---------------------------*/
+
+/*-------------- start layoutsegsize -------------------------*/
+static ssize_t layoutsegsize_write(struct file *file, const char __user *buf,
+				   size_t count, loff_t *offset)
+{
+	char cmd[50];
+
+	/* bound the copy by count (was a fixed 49-byte over-read) */
+	if (count >= sizeof(cmd) || copy_from_user(cmd, buf, count))
+		return -EFAULT;
+	cmd[count] = '\0';	/* user data is not NUL-terminated */
+	layoutsegment_size = simple_strtoull(cmd, NULL, 10);
+	return count;
+}
+
+static const struct file_operations layoutsegsize_ops = {
+ .write = layoutsegsize_write,
+};
+/*-------------- end layoutsegsize ---------------------------*/
+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
+
+int
+spnfs_init_proc(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = proc_mkdir("fs/spnfs", NULL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry = create_proc_entry("fs/spnfs/ctl", 0, NULL);
+ if (!entry)
+ return -ENOMEM;
+ entry->proc_fops = &ctl_ops;
+
+ entry = create_proc_entry("fs/spnfs/config", 0, NULL);
+ if (!entry)
+ return -ENOMEM;
+ entry->proc_fops = &config_ops;
+
+ entry = create_proc_entry("fs/spnfs/getfh", 0, NULL);
+ if (!entry)
+ return -ENOMEM;
+ entry->proc_fops = &getfh_ops;
+
+ entry = create_proc_entry("fs/spnfs/recall", 0, NULL);
+ if (!entry)
+ return -ENOMEM;
+ entry->proc_fops = &recall_ops;
+
+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
+ entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL);
+ if (!entry)
+ return -ENOMEM;
+ entry->proc_fops = &layoutseg_ops;
+
+ entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL);
+ if (!entry)
+ return -ENOMEM;
+ entry->proc_fops = &layoutsegsize_ops;
+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
+
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
diff -up linux-2.6.38.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.38.noarch/fs/nfsd/spnfs_ops.c
--- linux-2.6.38.noarch/fs/nfsd/spnfs_ops.c.orig 2011-03-26 07:57:44.302821074 -0400
+++ linux-2.6.38.noarch/fs/nfsd/spnfs_ops.c 2011-03-26 07:57:44.302821074 -0400
@@ -0,0 +1,878 @@
+/*
+ * fs/nfsd/spnfs_ops.c
+ *
+ * Communication layer between spNFS kernel and userspace
+ *
+ */
+/******************************************************************************
+
+(c) 2007 Network Appliance, Inc. All Rights Reserved.
+
+Network Appliance provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfsd4_spnfs.h>
+#include <linux/nfsd/debug.h>
+#include <linux/nfsd/nfsd4_pnfs.h>
+#include <linux/nfsd/nfs4layoutxdr.h>
+
+#include "pnfsd.h"
+
+/* comment out CONFIG_SPNFS_TEST for non-test behaviour */
+/* #define CONFIG_SPNFS_TEST 1 */
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+/*
+ * The functions that are called from elsewhere in the kernel
+ * to perform tasks in userspace
+ *
+ */
+
+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
+extern int spnfs_use_layoutsegments;
+extern uint64_t layoutsegment_size;
+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
+extern struct spnfs *global_spnfs;
+
+int
+spnfs_layout_type(struct super_block *sb)
+{
+ return LAYOUT_NFSV4_1_FILES;
+}
+
+enum nfsstat4
+spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr,
+ const struct nfsd4_pnfs_layoutget_arg *lg_arg,
+ struct nfsd4_pnfs_layoutget_res *lg_res)
+{
+ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
+ struct spnfs_msg *im = NULL;
+ union spnfs_msg_res *res = NULL;
+ struct pnfs_filelayout_layout *flp = NULL;
+ int status, i;
+ enum nfsstat4 nfserr;
+
+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
+ if (im == NULL) {
+ nfserr = NFS4ERR_LAYOUTTRYLATER;
+ goto layoutget_cleanup;
+ }
+
+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
+ if (res == NULL) {
+ nfserr = NFS4ERR_LAYOUTTRYLATER;
+ goto layoutget_cleanup;
+ }
+
+ im->im_type = SPNFS_TYPE_LAYOUTGET;
+ im->im_args.layoutget_args.inode = inode->i_ino;
+ im->im_args.layoutget_args.generation = inode->i_generation;
+
+ /* call function to queue the msg for upcall */
+ if (spnfs_upcall(spnfs, im, res) != 0) {
+ dprintk("failed spnfs upcall: layoutget\n");
+ nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
+ goto layoutget_cleanup;
+ }
+ status = res->layoutget_res.status;
+ if (status != 0) {
+ /* FIXME? until user mode is fixed, translate system error */
+ switch (status) {
+ case -E2BIG:
+ case -ETOOSMALL:
+ nfserr = NFS4ERR_TOOSMALL;
+ break;
+ case -ENOMEM:
+ case -EAGAIN:
+ case -EINTR:
+ nfserr = NFS4ERR_LAYOUTTRYLATER;
+ break;
+ case -ENOENT:
+ nfserr = NFS4ERR_BADLAYOUT;
+ break;
+ default:
+ nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
+ }
+ dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n",
+ status, nfserr);
+ goto layoutget_cleanup;
+ }
+
+ lg_res->lg_return_on_close = 0;
+#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
+ /* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */
+ /* the amount requested by the client. */
+ if (spnfs_use_layoutsegments) {
+ if (layoutsegment_size != 0)
+ lg_res->lg_seg.length = layoutsegment_size;
+ } else
+ lg_res->lg_seg.length = NFS4_MAX_UINT64;
+#else
+ lg_res->lg_seg.length = NFS4_MAX_UINT64;
+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
+
+ flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL);
+ if (flp == NULL) {
+ nfserr = NFS4ERR_LAYOUTTRYLATER;
+ goto layoutget_cleanup;
+ }
+ flp->device_id.sbid = lg_arg->lg_sbid;
+ flp->device_id.devid = res->layoutget_res.devid;
+ flp->lg_layout_type = 1; /* XXX */
+ flp->lg_stripe_type = res->layoutget_res.stripe_type;
+ flp->lg_commit_through_mds = 0;
+ flp->lg_stripe_unit = res->layoutget_res.stripe_size;
+ flp->lg_first_stripe_index = 0;
+ flp->lg_pattern_offset = 0;
+ flp->lg_fh_length = res->layoutget_res.stripe_count;
+
+ flp->lg_fh_list = kmalloc(flp->lg_fh_length * sizeof(struct knfsd_fh),
+ GFP_KERNEL);
+ if (flp->lg_fh_list == NULL) {
+ nfserr = NFS4ERR_LAYOUTTRYLATER;
+ goto layoutget_cleanup;
+ }
+ /*
+ * FIX: Doing an extra copy here. Should group res.flist's fh_len
+ * and fh_val into a knfsd_fh structure.
+ */
+ for (i = 0; i < flp->lg_fh_length; i++) {
+ flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len;
+ memcpy(&flp->lg_fh_list[i].fh_base,
+ res->layoutget_res.flist[i].fh_val,
+ res->layoutget_res.flist[i].fh_len);
+ }
+
+ /* encode the layoutget body */
+ nfserr = filelayout_encode_layout(xdr, flp);
+
+layoutget_cleanup:
+ if (flp) {
+ if (flp->lg_fh_list)
+ kfree(flp->lg_fh_list);
+ kfree(flp);
+ }
+ kfree(im);
+ kfree(res);
+
+ return nfserr;
+}
+
+int
+spnfs_layoutcommit(void)
+{
+ return 0;
+}
+
+int
+spnfs_layoutreturn(struct inode *inode,
+ const struct nfsd4_pnfs_layoutreturn_arg *args)
+{
+ return 0;
+}
+
+int
+spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
+{
+ struct super_block *sb;
+ struct nfsd4_pnfs_cb_layout lr;
+
+ switch (type) {
+ case RETURN_FILE:
+ sb = inode->i_sb;
+ dprintk("%s: recalling layout for ino = %lu\n",
+ __func__, inode->i_ino);
+ break;
+ case RETURN_FSID:
+ sb = inode->i_sb;
+ dprintk("%s: recalling layout for fsid x (unimplemented)\n",
+ __func__);
+ return 0;
+ case RETURN_ALL:
+ /* XXX figure out how to get a sb since there's no inode ptr */
+ dprintk("%s: recalling all layouts (unimplemented)\n",
+ __func__);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+
+ lr.cbl_recall_type = type;
+ lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES;
+ lr.cbl_seg.clientid = 0;
+ lr.cbl_seg.offset = offset;
+ lr.cbl_seg.length = len;
+ lr.cbl_seg.iomode = IOMODE_ANY;
+ lr.cbl_layoutchanged = 0;
+
+ nfsd_layout_recall_cb(sb, inode, &lr);
+
+ return 0;
+}
+
+
+int
+spnfs_test_layoutrecall(char *path, u64 offset, u64 len)
+{
+ struct nameidata nd;
+ struct inode *inode;
+ int type, rc;
+
+ dprintk("%s: path=%s, offset=%llu, len=%llu\n",
+ __func__, path, offset, len);
+
+ if (strcmp(path, "all") == 0) {
+ inode = NULL;
+ type = RETURN_ALL;
+ } else {
+ rc = path_lookup(path, 0, &nd);
+ if (rc != 0)
+ return -ENOENT;
+
+ /*
+ * XXX todo: add a RETURN_FSID scenario here...maybe if
+ * inode is a dir...
+ */
+
+ inode = nd.path.dentry->d_inode;
+ type = RETURN_FILE;
+ }
+
+ if (len == 0)
+ len = NFS4_MAX_UINT64;
+
+ rc = spnfs_layoutrecall(inode, type, offset, len);
+
+ if (type != RETURN_ALL)
+ path_put(&nd.path);
+ return rc;
+}
+
+int
+spnfs_getdeviceiter(struct super_block *sb,
+ u32 layout_type,
+ struct nfsd4_pnfs_dev_iter_res *gd_res)
+{
+ struct spnfs *spnfs = global_spnfs; /* XXX keep up the pretence */
+ struct spnfs_msg *im = NULL;
+ union spnfs_msg_res *res = NULL;
+ int status = 0;
+
+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
+ if (im == NULL) {
+ status = -ENOMEM;
+ goto getdeviceiter_out;
+ }
+
+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
+ if (res == NULL) {
+ status = -ENOMEM;
+ goto getdeviceiter_out;
+ }
+
+ im->im_type = SPNFS_TYPE_GETDEVICEITER;
+ im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie;
+ im->im_args.getdeviceiter_args.verf = gd_res->gd_verf;
+
+ /* call function to queue the msg for upcall */
+ status = spnfs_upcall(spnfs, im, res);
+ if (status != 0) {
+ dprintk("%s spnfs upcall failure: %d\n", __func__, status);
+ status = -EIO;
+ goto getdeviceiter_out;
+ }
+ status = res->getdeviceiter_res.status;
+
+ if (res->getdeviceiter_res.eof)
+ gd_res->gd_eof = 1;
+ else {
+ gd_res->gd_devid = res->getdeviceiter_res.devid;
+ gd_res->gd_cookie = res->getdeviceiter_res.cookie;
+ gd_res->gd_verf = res->getdeviceiter_res.verf;
+ gd_res->gd_eof = 0;
+ }
+
+getdeviceiter_out:
+ kfree(im);
+ kfree(res);
+
+ return status;
+}
+
+#ifdef CONFIG_SPNFS_TEST
+/*
+ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the
+ * 1024 encoded stripe indices.
+ *
+ * Skip the devaddr4 length and encode the indices count (1024) in the
+ * rq_res.head and set the rq_res.head length.
+ *
+ * Set the rq_res page_len to 4096 (for the 1024 stripe indices).
+ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the
+ * rq_res head to hold the rest of the getdeviceinfo return.
+ *
+ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and
+ * rq_respages[rq_resused] contains the rq_res.pages.
+ */
+static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info,
+ const struct pnfs_filelayout_device *fdev)
+{
+ struct nfsd4_compoundres *resp = info->resp;
+ struct svc_rqst *rqstp = resp->rqstp;
+ struct xdr_buf *xb = &resp->rqstp->rq_res;
+ __be32 *p;
+
+ p = nfsd4_xdr_reserve_space(resp, 8);
+ p++; /* Fill in length later */
+ *p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */
+ resp->p = p;
+
+ xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base;
+ xb->pages = &rqstp->rq_respages[rqstp->rq_resused];
+ xb->page_base = 0;
+ xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */
+ xb->tail[0].iov_base = resp->p;
+ resp->end = xb->head[0].iov_base + PAGE_SIZE;
+ xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p;
+ return 0;
+}
+/*
+ * Return a stripeindices of length 1024 to test
+ * the pNFS client multipage getdeviceinfo implementation.
+ *
+ * Encode a page of stripe indices.
+ */
+static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev,
+ struct spnfs_device *dev,
+ struct pnfs_devinfo_arg *info)
+{
+ struct svc_rqst *rqstp = info->xdr.resp->rqstp;
+ __be32 *p;
+ int i, j = 0;
+
+ p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]);
+ fldev->fl_stripeindices_length = 1024;
+	/* round-robin the data server device indices into the stripe index list */
+ for (i = 0; i < 1024; i++) {
+ *p++ = cpu_to_be32(j);
+ if (j < dev->dscount - 1)
+ j++;
+ else
+ j = 0;
+ }
+ fldev->fl_stripeindices_list = NULL;
+}
+#endif /* CONFIG_SPNFS_TEST */
+
+int
+spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
+ u32 layout_type,
+ const struct nfsd4_pnfs_deviceid *devid)
+{
+ struct spnfs *spnfs = global_spnfs;
+ struct spnfs_msg *im = NULL;
+ union spnfs_msg_res *res = NULL;
+ struct spnfs_device *dev;
+ struct pnfs_filelayout_device *fldev = NULL;
+ struct pnfs_filelayout_multipath *mp = NULL;
+ struct pnfs_filelayout_devaddr *fldap = NULL;
+ int status = 0, i, len;
+
+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
+ if (im == NULL) {
+ status = -ENOMEM;
+ goto getdeviceinfo_out;
+ }
+
+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
+ if (res == NULL) {
+ status = -ENOMEM;
+ goto getdeviceinfo_out;
+ }
+
+ im->im_type = SPNFS_TYPE_GETDEVICEINFO;
+ /* XXX FIX: figure out what to do about fsid */
+ im->im_args.getdeviceinfo_args.devid = devid->devid;
+
+ /* call function to queue the msg for upcall */
+ status = spnfs_upcall(spnfs, im, res);
+ if (status != 0) {
+ dprintk("%s spnfs upcall failure: %d\n", __func__, status);
+ status = -EIO;
+ goto getdeviceinfo_out;
+ }
+ status = res->getdeviceinfo_res.status;
+ if (status != 0)
+ goto getdeviceinfo_out;
+
+ dev = &res->getdeviceinfo_res.devinfo;
+
+ /* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */
+ fldev = kzalloc(sizeof(struct pnfs_filelayout_device), GFP_KERNEL);
+ if (fldev == NULL) {
+ status = -ENOMEM;
+ goto getdeviceinfo_out;
+ }
+
+ /*
+ * Stripe count is the same as data server count for our purposes
+ */
+ fldev->fl_stripeindices_length = dev->dscount;
+ fldev->fl_device_length = dev->dscount;
+
+ /* Set stripe indices */
+#ifdef CONFIG_SPNFS_TEST
+	spnfs_set_test_indices(fldev, dev, info);	/* FIXME: 'info' is not declared in this function; breaks the build when CONFIG_SPNFS_TEST is defined */
+ fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr;
+#else /* CONFIG_SPNFS_TEST */
+ fldev->fl_stripeindices_list =
+ kmalloc(fldev->fl_stripeindices_length * sizeof(u32),
+ GFP_KERNEL);
+ if (fldev->fl_stripeindices_list == NULL) {
+ status = -ENOMEM;
+ goto getdeviceinfo_out;
+ }
+ for (i = 0; i < fldev->fl_stripeindices_length; i++)
+ fldev->fl_stripeindices_list[i] = i;
+#endif /* CONFIG_SPNFS_TEST */
+
+ /*
+ * Set the device's data server addresses No multipath for spnfs,
+ * so mp length is always 1.
+ *
+ */
+	fldev->fl_device_list =
+		kzalloc(fldev->fl_device_length *	/* zeroed so the error */
+			sizeof(struct pnfs_filelayout_multipath), /* path only frees */
+			GFP_KERNEL);			  /* populated slots */
+ if (fldev->fl_device_list == NULL) {
+ status = -ENOMEM;
+ goto getdeviceinfo_out;
+ }
+ for (i = 0; i < fldev->fl_device_length; i++) {
+ mp = &fldev->fl_device_list[i];
+ mp->fl_multipath_length = 1;
+		mp->fl_multipath_list =
+			kzalloc(sizeof(struct pnfs_filelayout_devaddr),
+				GFP_KERNEL);	/* zeroed: r_addr may be freed unset */
+ if (mp->fl_multipath_list == NULL) {
+ status = -ENOMEM;
+ goto getdeviceinfo_out;
+ }
+ fldap = mp->fl_multipath_list;
+
+ /*
+ * Copy the netid into the device address, for example: "tcp"
+ */
+ len = strlen(dev->dslist[i].netid);
+ fldap->r_netid.data = kmalloc(len, GFP_KERNEL);
+ if (fldap->r_netid.data == NULL) {
+ status = -ENOMEM;
+ goto getdeviceinfo_out;
+ }
+ memcpy(fldap->r_netid.data, dev->dslist[i].netid, len);
+ fldap->r_netid.len = len;
+
+ /*
+ * Copy the network address into the device address,
+ * for example: "10.35.9.16.08.01"
+ */
+ len = strlen(dev->dslist[i].addr);
+ fldap->r_addr.data = kmalloc(len, GFP_KERNEL);
+ if (fldap->r_addr.data == NULL) {
+ status = -ENOMEM;
+ goto getdeviceinfo_out;
+ }
+ memcpy(fldap->r_addr.data, dev->dslist[i].addr, len);
+ fldap->r_addr.len = len;
+ }
+
+ /* encode the device data */
+ status = filelayout_encode_devinfo(xdr, fldev);
+
+getdeviceinfo_out:
+ if (fldev) {
+ kfree(fldev->fl_stripeindices_list);
+ if (fldev->fl_device_list) {
+			for (i = 0; i < fldev->fl_device_length; i++) {
+				fldap = fldev->fl_device_list[i].fl_multipath_list;
+				if (fldap == NULL)	/* slot never populated */
+					continue;
+				kfree(fldap->r_netid.data);
+				kfree(fldap->r_addr.data); kfree(fldap);
+			}
+ kfree(fldev->fl_device_list);
+ }
+ kfree(fldev);
+ }
+
+ kfree(im);
+ kfree(res);
+
+ return status;
+}
+
+int
+spnfs_setattr(void)
+{
+ return 0;
+}
+
+int
+spnfs_open(struct inode *inode, struct nfsd4_open *open)
+{
+ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
+ struct spnfs_msg *im = NULL;
+ union spnfs_msg_res *res = NULL;
+ int status = 0;
+
+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
+ if (im == NULL) {
+ status = -ENOMEM;
+ goto open_out;
+ }
+
+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
+ if (res == NULL) {
+ status = -ENOMEM;
+ goto open_out;
+ }
+
+ im->im_type = SPNFS_TYPE_OPEN;
+ im->im_args.open_args.inode = inode->i_ino;
+ im->im_args.open_args.generation = inode->i_generation;
+ im->im_args.open_args.create = open->op_create;
+ im->im_args.open_args.createmode = open->op_createmode;
+ im->im_args.open_args.truncate = open->op_truncate;
+
+ /* call function to queue the msg for upcall */
+ status = spnfs_upcall(spnfs, im, res);
+ if (status != 0) {
+ dprintk("%s spnfs upcall failure: %d\n", __func__, status);
+ status = -EIO;
+ goto open_out;
+ }
+ status = res->open_res.status;
+
+open_out:
+ kfree(im);
+ kfree(res);
+
+ return status;
+}
+
+int
+spnfs_create(void)
+{
+ return 0;
+}
+
+/*
+ * Invokes the spnfsd with the inode number of the object to remove.
+ * The file has already been removed on the MDS, so all the spnsfd
+ * daemon does is remove the stripes.
+ * Returns 0 on success otherwise error code
+ */
+int
+spnfs_remove(unsigned long ino, unsigned long generation)
+{
+ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
+ struct spnfs_msg *im = NULL;
+ union spnfs_msg_res *res = NULL;
+ int status = 0;
+
+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
+ if (im == NULL) {
+ status = -ENOMEM;
+ goto remove_out;
+ }
+
+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
+ if (res == NULL) {
+ status = -ENOMEM;
+ goto remove_out;
+ }
+
+ im->im_type = SPNFS_TYPE_REMOVE;
+ im->im_args.remove_args.inode = ino;
+ im->im_args.remove_args.generation = generation;
+
+ /* call function to queue the msg for upcall */
+ status = spnfs_upcall(spnfs, im, res);
+ if (status != 0) {
+ dprintk("%s spnfs upcall failure: %d\n", __func__, status);
+ status = -EIO;
+ goto remove_out;
+ }
+ status = res->remove_res.status;
+
+remove_out:
+ kfree(im);
+ kfree(res);
+
+ return status;
+}
+
+static int
+read_one(struct inode *inode, loff_t offset, size_t len, char *buf,
+ struct file **filp)
+{
+ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
+ size_t iolen;
+ int completed = 0, ds, err;
+
+ while (len > 0) {
+ tmp = offset;
+ soff = do_div(tmp, spnfs_config->stripe_size);
+ snum = tmp;
+ ds = do_div(tmp, spnfs_config->num_ds);
+ if (spnfs_config->dense_striping == 0)
+ soffset = offset;
+ else {
+ tmp = snum;
+ do_div(tmp, spnfs_config->num_ds);
+ soffset = tmp * spnfs_config->stripe_size + soff;
+ }
+ if (len < spnfs_config->stripe_size - soff)
+ iolen = len;
+ else
+ iolen = spnfs_config->stripe_size - soff;
+
+ pos = soffset;
+ err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos);
+ if (err < 0)
+ return -EIO;
+ if (err == 0)
+ break;
+ filp[ds]->f_pos = pos;
+ iolen = err;
+ completed += iolen;
+ len -= iolen;
+ offset += iolen;
+ bufoffset += iolen;
+ }
+
+ return completed;
+}
+
+static __be32
+read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
+ struct svc_rqst *rqstp)
+{
+ int i, vnum, err, bytecount = 0;
+ char path[128];
+ struct file *filp[SPNFS_MAX_DATA_SERVERS];
+ size_t iolen;
+ __be32 status = nfs_ok;
+
+ /*
+ * XXX We should just be doing this at open time, but it gets
+ * kind of messy storing this info in nfsd's state structures
+ * and piggybacking its path through the various state handling
+ * functions. Revisit this.
+ */
+ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
+ for (i = 0; i < spnfs_config->num_ds; i++) {
+		snprintf(path, sizeof(path), "%s/%ld.%u", spnfs_config->ds_dir[i],
+			 inode->i_ino, inode->i_generation);
+		filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0);
+		if (IS_ERR(filp[i])) {	/* filp_open returns ERR_PTR, never NULL */
+			filp[i] = NULL;	/* cleanup loop tests for NULL */
+			status = nfserr_io; goto read_out;
+		}
+		get_file(filp[i]);
+ }
+
+ for (vnum = 0 ; vnum < vlen ; vnum++) {
+ iolen = rqstp->rq_vec[vnum].iov_len;
+ err = read_one(inode, offset + bytecount, iolen,
+ (char *)rqstp->rq_vec[vnum].iov_base, filp);
+ if (err < 0) {
+ status = nfserr_io;
+ goto read_out;
+ }
+ if (err < iolen) {
+ bytecount += err;
+ goto read_out;
+ }
+ bytecount += rqstp->rq_vec[vnum].iov_len;
+ }
+
+read_out:
+ *lenp = bytecount;
+ for (i = 0; i < spnfs_config->num_ds; i++) {
+ if (filp[i]) {
+ filp_close(filp[i], current->files);
+ fput(filp[i]);
+ }
+ }
+ return status;
+}
+
+__be32
+spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
+ struct svc_rqst *rqstp)
+{
+ if (spnfs_config)
+ return read(inode, offset, lenp, vlen, rqstp);
+ else {
+ printk(KERN_ERR "Please upgrade to latest spnfsd\n");
+ return nfserr_notsupp;
+ }
+}
+
+static int
+write_one(struct inode *inode, loff_t offset, size_t len, char *buf,
+ struct file **filp)
+{
+ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
+ size_t iolen;
+ int completed = 0, ds, err;
+
+ while (len > 0) {
+ tmp = offset;
+ soff = do_div(tmp, spnfs_config->stripe_size);
+ snum = tmp;
+ ds = do_div(tmp, spnfs_config->num_ds);
+ if (spnfs_config->dense_striping == 0)
+ soffset = offset;
+ else {
+ tmp = snum;
+ do_div(tmp, spnfs_config->num_ds);
+ soffset = tmp * spnfs_config->stripe_size + soff;
+ }
+ if (len < spnfs_config->stripe_size - soff)
+ iolen = len;
+ else
+ iolen = spnfs_config->stripe_size - soff;
+
+ pos = soffset;
+ err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos);
+		if (err <= 0)	/* 0 = no progress; bail out instead of looping forever */
+			return -EIO;
+ filp[ds]->f_pos = pos;
+ iolen = err;
+ completed += iolen;
+ len -= iolen;
+ offset += iolen;
+ bufoffset += iolen;
+ }
+
+ return completed;
+}
+
+static __be32
+write(struct inode *inode, loff_t offset, size_t len, int vlen,
+ struct svc_rqst *rqstp)
+{
+ int i, vnum, err, bytecount = 0;
+ char path[128];
+ struct file *filp[SPNFS_MAX_DATA_SERVERS];
+ size_t iolen;
+ __be32 status = nfs_ok;
+
+ /*
+ * XXX We should just be doing this at open time, but it gets
+ * kind of messy storing this info in nfsd's state structures
+ * and piggybacking its path through the various state handling
+ * functions. Revisit this.
+ */
+ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
+ for (i = 0; i < spnfs_config->num_ds; i++) {
+		snprintf(path, sizeof(path), "%s/%ld.%u", spnfs_config->ds_dir[i],
+			 inode->i_ino, inode->i_generation);
+		filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0);
+		if (IS_ERR(filp[i])) {	/* filp_open returns ERR_PTR, never NULL */
+			filp[i] = NULL;	/* cleanup loop tests for NULL */
+			status = nfserr_io; goto write_out;
+		}
+		get_file(filp[i]);
+ }
+
+ for (vnum = 0; vnum < vlen; vnum++) {
+ iolen = rqstp->rq_vec[vnum].iov_len;
+ err = write_one(inode, offset + bytecount, iolen,
+ (char *)rqstp->rq_vec[vnum].iov_base, filp);
+ if (err != iolen) {
+			dprintk("spnfs_write: err=%d expected %Zd\n", err, iolen);
+ status = nfserr_io;
+ goto write_out;
+ }
+ bytecount += rqstp->rq_vec[vnum].iov_len;
+ }
+
+write_out:
+ for (i = 0; i < spnfs_config->num_ds; i++) {
+ if (filp[i]) {
+ filp_close(filp[i], current->files);
+ fput(filp[i]);
+ }
+ }
+
+ return status;
+}
+
+__be32
+spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen,
+ struct svc_rqst *rqstp)
+{
+ if (spnfs_config)
+ return write(inode, offset, len, vlen, rqstp);
+ else {
+ printk(KERN_ERR "Please upgrade to latest spnfsd\n");
+ return nfserr_notsupp;
+ }
+}
+
+int
+spnfs_commit(void)
+{
+ return 0;
+}
+
+/*
+ * Return the state for this object.
+ * At this time simply return 0 to indicate success and use the existing state
+ */
+int
+spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg)
+{
+ return 0;
+}
+
+/*
+ * Return the filehandle for the specified file descriptor
+ */
+int
+spnfs_getfh(int fd, struct nfs_fh *fh)
+{
+ struct file *file;
+
+ file = fget(fd);
+ if (file == NULL)
+ return -EIO;
+
+ memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh));
+ fput(file);
+ return 0;
+}
diff -up linux-2.6.38.noarch/fs/nfsd/state.h.orig linux-2.6.38.noarch/fs/nfsd/state.h
--- linux-2.6.38.noarch/fs/nfsd/state.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/state.h 2011-03-26 07:57:44.303821066 -0400
@@ -37,6 +37,7 @@
#include <linux/sunrpc/svc_xprt.h>
#include <linux/nfsd/nfsfh.h>
+#include <linux/nfsd/export.h>
#include "nfsfh.h"
typedef struct {
@@ -65,17 +66,6 @@ typedef struct {
(s)->si_fileid, \
(s)->si_generation
-struct nfsd4_callback {
- void *cb_op;
- struct nfs4_client *cb_clp;
- struct list_head cb_per_client;
- u32 cb_minorversion;
- struct rpc_message cb_msg;
- const struct rpc_call_ops *cb_ops;
- struct work_struct cb_work;
- bool cb_done;
-};
-
struct nfs4_delegation {
struct list_head dl_perfile;
struct list_head dl_perclnt;
@@ -245,6 +235,7 @@ struct nfs4_client {
#define NFSD4_CB_UP 0
#define NFSD4_CB_UNKNOWN 1
#define NFSD4_CB_DOWN 2
+#define NFSD4_CB_FAULT 3
int cl_cb_state;
struct nfsd4_callback cl_cb_null;
struct nfsd4_session *cl_cb_session;
@@ -265,6 +256,12 @@ struct nfs4_client {
unsigned long cl_cb_slot_busy;
struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
/* wait here for slots */
+#if defined(CONFIG_PNFSD)
+ struct list_head cl_layouts; /* outstanding layouts */
+ struct list_head cl_layoutrecalls; /* outstanding layoutrecall
+ callbacks */
+ atomic_t cl_deviceref; /* Num outstanding devs */
+#endif /* CONFIG_PNFSD */
};
static inline void
@@ -384,6 +381,14 @@ struct nfs4_file {
u32 fi_id; /* used with stateowner->so_id
* for stateid_hashtbl hash */
bool fi_had_conflict;
+#if defined(CONFIG_PNFSD)
+ struct list_head fi_layouts;
+ struct list_head fi_layout_states;
+ /* used by layoutget / layoutrecall */
+ struct nfs4_fsid fi_fsid;
+ u32 fi_fhlen;
+ u8 fi_fhval[NFS4_FHSIZE];
+#endif /* CONFIG_PNFSD */
};
/* XXX: for first cut may fall back on returning file that doesn't work
@@ -412,6 +417,15 @@ static inline struct file *find_any_file
return f->fi_fds[O_RDONLY];
}
+#if defined(CONFIG_PNFSD)
+/* pNFS Metadata server state */
+
+struct pnfs_ds_dev_entry {
+ struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */
+ u32 dd_dsid;
+};
+#endif /* CONFIG_PNFSD */
+
/*
* nfs4_stateid can either be an open stateid or (eventually) a lock stateid
*
@@ -434,6 +448,9 @@ struct nfs4_stateid {
struct list_head st_perfile;
struct list_head st_perstateowner;
struct list_head st_lockowners;
+#if defined(CONFIG_PNFSD)
+ struct list_head st_pnfs_ds_id;
+#endif /* CONFIG_PNFSD */
struct nfs4_stateowner * st_stateowner;
struct nfs4_file * st_file;
stateid_t st_stateid;
@@ -486,6 +503,34 @@ extern void nfsd4_recdir_purge_old(void)
extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
extern void release_session_client(struct nfsd4_session *);
+extern void nfsd4_free_slab(struct kmem_cache **);
+extern struct nfs4_file *find_file(struct inode *);
+extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *);
+extern void put_nfs4_file(struct nfs4_file *);
+extern void get_nfs4_file(struct nfs4_file *);
+extern struct nfs4_client *find_confirmed_client(clientid_t *);
+extern struct nfs4_stateid *find_stateid(stateid_t *, int flags);
+extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *);
+extern __be32 nfs4_check_stateid(stateid_t *);
+extern void expire_client_lock(struct nfs4_client *);
+extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *);
+
+#if defined(CONFIG_PNFSD)
+extern int nfsd4_init_pnfs_slabs(void);
+extern void nfsd4_free_pnfs_slabs(void);
+extern void pnfs_expire_client(struct nfs4_client *);
+extern void release_pnfs_ds_dev_list(struct nfs4_stateid *);
+extern void nfs4_pnfs_state_init(void);
+extern void nfs4_pnfs_state_shutdown(void);
+extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *);
+extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *);
+#else /* CONFIG_PNFSD */
+static inline void nfsd4_free_pnfs_slabs(void) {}
+static inline int nfsd4_init_pnfs_slabs(void) { return 0; }
+static inline void pnfs_expire_client(struct nfs4_client *clp) {}
+static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {}
+static inline void nfs4_pnfs_state_shutdown(void) {}
+#endif /* CONFIG_PNFSD */
static inline void
nfs4_put_stateowner(struct nfs4_stateowner *so)
@@ -499,4 +544,24 @@ nfs4_get_stateowner(struct nfs4_stateown
kref_get(&so->so_ref);
}
+static inline u64
+end_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ end = start + len;
+ return end >= start ? end : NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ BUG_ON(!len);
+ end = start + len;
+ return end > start ? end - 1 : NFS4_MAX_UINT64;
+}
+
#endif /* NFSD4_STATE_H */
diff -up linux-2.6.38.noarch/fs/nfsd/vfs.c.orig linux-2.6.38.noarch/fs/nfsd/vfs.c
--- linux-2.6.38.noarch/fs/nfsd/vfs.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/vfs.c 2011-03-26 07:57:44.304821057 -0400
@@ -36,7 +36,12 @@
#ifdef CONFIG_NFSD_V4
#include "acl.h"
#include "idmap.h"
+#include "pnfsd.h"
+#include <linux/nfsd4_spnfs.h>
#endif /* CONFIG_NFSD_V4 */
+#if defined(CONFIG_SPNFS_BLOCK)
+#include <linux/nfsd4_block.h>
+#endif
#include "nfsd.h"
#include "vfs.h"
@@ -380,6 +385,16 @@ nfsd_setattr(struct svc_rqst *rqstp, str
NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
if (err)
goto out;
+#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
+ if (is_inode_pnfsd_lexp(inode))
+ pnfsd_lexp_recall_layout(inode);
+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
+#if defined(CONFIG_SPNFS_BLOCK)
+ if (pnfs_block_enabled(inode, 0)) {
+ err = bl_layoutrecall(inode, RETURN_FILE,
+ iap->ia_size, inode->i_size - iap->ia_size);
+ }
+#endif /* CONFIG_SPNFS_BLOCK */
}
host_err = get_write_access(inode);
@@ -1685,6 +1700,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru
struct inode *fdir, *tdir;
__be32 err;
int host_err;
+#ifdef CONFIG_SPNFS
+ unsigned long ino = 0;
+ unsigned long generation = 0;
+ unsigned int nlink = 0;
+#endif /* CONFIG_SPNFS */
err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
if (err)
@@ -1751,7 +1771,27 @@ nfsd_rename(struct svc_rqst *rqstp, stru
}
if (host_err)
goto out_drop_write;
+
+#ifdef CONFIG_SPNFS
+ /*
+ * if the target is a preexisting regular file, remember the
+ * inode number and generation so we can delete the stripes;
+ * save the link count as well so that the stripes only get
+	 * deleted when the last link is deleted
+ */
+ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) {
+ ino = ndentry->d_inode->i_ino;
+ generation = ndentry->d_inode->i_generation;
+ nlink = ndentry->d_inode->i_nlink;
+ }
+#endif /* CONFIG_SPNFS */
+
host_err = vfs_rename(fdir, odentry, tdir, ndentry);
+#ifdef CONFIG_SPNFS
+ if (spnfs_enabled() && (!host_err && ino && nlink == 1))
+ spnfs_remove(ino, generation);
+#endif /* CONFIG_SPNFS */
+
if (!host_err) {
host_err = commit_metadata(tfhp);
if (!host_err)
@@ -1791,6 +1831,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru
struct inode *dirp;
__be32 err;
int host_err;
+#if defined(CONFIG_SPNFS)
+ unsigned long ino;
+ unsigned long generation;
+ unsigned int nlink;
+#endif /* defined(CONFIG_SPNFS) */
err = nfserr_acces;
if (!flen || isdotent(fname, flen))
@@ -1814,6 +1859,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru
goto out;
}
+#if defined(CONFIG_SPNFS)
+ /*
+ * Remember the inode number to communicate to the spnfsd
+ * for removal of stripes; save the link count as well so that
+	 * the stripes only get deleted when the last link is deleted
+ */
+ ino = rdentry->d_inode->i_ino;
+ generation = rdentry->d_inode->i_generation;
+ nlink = rdentry->d_inode->i_nlink;
+#endif /* defined(CONFIG_SPNFS) */
+
if (!type)
type = rdentry->d_inode->i_mode & S_IFMT;
@@ -1830,6 +1886,26 @@ nfsd_unlink(struct svc_rqst *rqstp, stru
host_err = vfs_rmdir(dirp, rdentry);
if (!host_err)
host_err = commit_metadata(fhp);
+
+#if defined(CONFIG_SPNFS)
+ /*
+ * spnfs: notify spnfsd of removal to destroy stripes
+ */
+ dprintk("%s check if spnfs_enabled\n", __FUNCTION__);
+ if (spnfs_enabled() && nlink == 1) {
+ BUG_ON(ino == 0);
+ dprintk("%s calling spnfs_remove inumber=%ld\n",
+ __FUNCTION__, ino);
+ if (spnfs_remove(ino, generation) == 0) {
+ dprintk("%s spnfs_remove success\n", __FUNCTION__);
+ } else {
+ /* XXX How do we make this atomic? */
+ printk(KERN_WARNING "nfsd: pNFS could not "
+ "remove stripes for inode: %ld\n", ino);
+ }
+ }
+#endif /* defined(CONFIG_SPNFS) */
+
out_drop_write:
mnt_drop_write(fhp->fh_export->ex_path.mnt);
out_put:
diff -up linux-2.6.38.noarch/fs/nfsd/xdr4.h.orig linux-2.6.38.noarch/fs/nfsd/xdr4.h
--- linux-2.6.38.noarch/fs/nfsd/xdr4.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfsd/xdr4.h 2011-03-26 07:57:44.305821048 -0400
@@ -37,6 +37,8 @@
#ifndef _LINUX_NFSD_XDR4_H
#define _LINUX_NFSD_XDR4_H
+#include <linux/nfsd/nfsd4_pnfs.h>
+
#include "state.h"
#include "nfsd.h"
@@ -390,6 +392,51 @@ struct nfsd4_reclaim_complete {
u32 rca_one_fs;
};
+struct nfsd4_pnfs_getdevinfo {
+ struct nfsd4_pnfs_deviceid gd_devid; /* request */
+ u32 gd_layout_type; /* request */
+ u32 gd_maxcount; /* request */
+ u32 gd_notify_types;/* request */
+ struct super_block *gd_sb;
+};
+
+struct nfsd4_pnfs_getdevlist {
+ u32 gd_layout_type; /* request */
+ u32 gd_maxdevices; /* request */
+ u64 gd_cookie; /* request - response */
+ u64 gd_verf; /* request - response */
+ struct svc_fh *gd_fhp; /* response */
+ u32 gd_eof; /* response */
+};
+
+struct nfsd4_pnfs_layoutget {
+ u64 lg_minlength; /* request */
+ u32 lg_signal; /* request */
+ u32 lg_maxcount; /* request */
+ struct svc_fh *lg_fhp; /* request */
+ stateid_t lg_sid; /* request/response */
+ struct nfsd4_layout_seg lg_seg; /* request/response */
+ u32 lg_roc; /* response */
+};
+
+struct nfsd4_pnfs_layoutcommit {
+ struct nfsd4_pnfs_layoutcommit_arg args;
+ stateid_t lc_sid; /* request */
+ struct nfsd4_pnfs_layoutcommit_res res;
+};
+
+enum layoutreturn_flags {
+ LR_FLAG_INTERN = 1 << 0, /* internal return */
+ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */
+};
+
+struct nfsd4_pnfs_layoutreturn {
+ struct nfsd4_pnfs_layoutreturn_arg args;
+ u32 lr_flags;
+ stateid_t lr_sid; /* request/response */
+ u32 lrs_present; /* response */
+};
+
struct nfsd4_op {
int opnum;
__be32 status;
@@ -432,6 +479,13 @@ struct nfsd4_op {
struct nfsd4_destroy_session destroy_session;
struct nfsd4_sequence sequence;
struct nfsd4_reclaim_complete reclaim_complete;
+#if defined(CONFIG_PNFSD)
+ struct nfsd4_pnfs_getdevlist pnfs_getdevlist;
+ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo;
+ struct nfsd4_pnfs_layoutget pnfs_layoutget;
+ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit;
+ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn;
+#endif /* CONFIG_PNFSD */
} u;
struct nfs4_replay * replay;
};
diff -up linux-2.6.38.noarch/fs/nfs/file.c.orig linux-2.6.38.noarch/fs/nfs/file.c
--- linux-2.6.38.noarch/fs/nfs/file.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/file.c 2011-03-26 07:57:44.247821541 -0400
@@ -381,16 +381,16 @@ static int nfs_write_begin(struct file *
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
struct page *page;
int once_thru = 0;
+ struct pnfs_layout_segment *lseg;
dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name,
mapping->host->i_ino, len, (long long) pos);
- pnfs_update_layout(mapping->host,
- nfs_file_open_context(file),
- IOMODE_RW);
-
+ lseg = pnfs_update_layout(mapping->host,
+ nfs_file_open_context(file),
+ pos, len, IOMODE_RW);
start:
/*
* Prevent starvation issues if someone is doing a consistency
@@ -399,17 +399,22 @@ start:
ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
nfs_wait_bit_killable, TASK_KILLABLE);
if (ret)
- return ret;
+ goto out;
page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
*pagep = page;
- ret = nfs_flush_incompatible(file, page);
+ ret = nfs_flush_incompatible(file, page, lseg);
if (ret) {
unlock_page(page);
page_cache_release(page);
+ *pagep = NULL;
+ *fsdata = NULL;
+ goto out;
} else if (!once_thru &&
nfs_want_read_modify_write(file, page, pos, len)) {
once_thru = 1;
@@ -418,6 +423,12 @@ start:
if (!ret)
goto start;
}
+ ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata);
+ out:
+ if (ret) {
+ put_lseg(lseg);
+ *fsdata = NULL;
+ }
return ret;
}
@@ -427,6 +438,7 @@ static int nfs_write_end(struct file *fi
{
unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
int status;
+ struct pnfs_layout_segment *lseg;
dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
file->f_path.dentry->d_parent->d_name.name,
@@ -453,10 +465,17 @@ static int nfs_write_end(struct file *fi
zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
}
- status = nfs_updatepage(file, page, offset, copied);
+ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata);
+ status = pnfs_write_end(file, page, pos, len, copied, lseg);
+ if (status)
+ goto out;
+ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata);
+ out:
unlock_page(page);
page_cache_release(page);
+ pnfs_write_end_cleanup(file, fsdata);
+ put_lseg(lseg);
if (status < 0)
return status;
@@ -567,6 +586,8 @@ static int nfs_vm_page_mkwrite(struct vm
/* make sure the cache has finished storing the page */
nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
+ /* XXX Do we want to call pnfs_update_layout here? */
+
lock_page(page);
mapping = page->mapping;
if (mapping != dentry->d_inode->i_mapping)
@@ -577,8 +598,8 @@ static int nfs_vm_page_mkwrite(struct vm
goto out_unlock;
ret = VM_FAULT_LOCKED;
- if (nfs_flush_incompatible(filp, page) == 0 &&
- nfs_updatepage(filp, page, 0, pagelen) == 0)
+ if (nfs_flush_incompatible(filp, page, NULL) == 0 &&
+ nfs_updatepage(filp, page, 0, pagelen, NULL, NULL) == 0)
goto out;
ret = VM_FAULT_SIGBUS;
diff -up linux-2.6.38.noarch/fs/nfs/inode.c.orig linux-2.6.38.noarch/fs/nfs/inode.c
--- linux-2.6.38.noarch/fs/nfs/inode.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/inode.c 2011-03-26 07:57:44.248821533 -0400
@@ -653,6 +653,7 @@ struct nfs_open_context *get_nfs_open_co
atomic_inc(&ctx->lock_context.count);
return ctx;
}
+EXPORT_SYMBOL(get_nfs_open_context);
static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
{
@@ -1016,6 +1017,7 @@ void nfs_fattr_init(struct nfs_fattr *fa
fattr->time_start = jiffies;
fattr->gencount = nfs_inc_attr_generation_counter();
}
+EXPORT_SYMBOL(nfs_fattr_init);
struct nfs_fattr *nfs_alloc_fattr(void)
{
@@ -1225,6 +1227,14 @@ static int nfs_update_inode(struct inode
server->fsid = fattr->fsid;
/*
+ * file needs layout commit, server attributes may be stale
+ */
+ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) {
+ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n",
+ __func__, inode->i_sb->s_id, inode->i_ino);
+ return 0;
+ }
+ /*
* Update the read time so we don't revalidate too often.
*/
nfsi->read_cache_jiffies = fattr->time_start;
@@ -1423,9 +1433,10 @@ static int nfs_update_inode(struct inode
*/
void nfs4_evict_inode(struct inode *inode)
{
- pnfs_destroy_layout(NFS_I(inode));
+ pnfs_return_layout(inode, NULL, true);
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
+ pnfs_destroy_layout(NFS_I(inode));
/* If we are holding a delegation, return it! */
nfs_inode_return_delegation_noreclaim(inode);
/* First call standard NFS clear_inode() code */
diff -up linux-2.6.38.noarch/fs/nfs/internal.h.orig linux-2.6.38.noarch/fs/nfs/internal.h
--- linux-2.6.38.noarch/fs/nfs/internal.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/internal.h 2011-03-26 07:57:44.249821524 -0400
@@ -148,6 +148,16 @@ extern struct nfs_server *nfs_clone_serv
struct nfs_fattr *);
extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
extern int nfs4_check_client_ready(struct nfs_client *clp);
+extern int nfs_sockaddr_cmp(const struct sockaddr *sa1,
+ const struct sockaddr *sa2);
+extern int nfs4_set_client(struct nfs_server *server,
+ const char *hostname,
+ const struct sockaddr *addr,
+ const size_t addrlen,
+ const char *ip_addr,
+ rpc_authflavor_t authflavour,
+ int proto, const struct rpc_timeout *timeparms,
+ u32 minorversion);
#ifdef CONFIG_PROC_FS
extern int __init nfs_fs_proc_init(void);
extern void nfs_fs_proc_exit(void);
@@ -213,6 +223,8 @@ extern const u32 nfs41_maxwrite_overhead
extern struct rpc_procinfo nfs4_procedures[];
#endif
+extern int nfs4_recover_expired_lease(struct nfs_client *clp);
+
/* proc.c */
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
@@ -262,10 +274,31 @@ extern int nfs4_get_rootfh(struct nfs_se
#endif
/* read.c */
+extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops);
+extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops);
extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
/* write.c */
+extern int nfs_initiate_write(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how);
+extern int pnfs_initiate_write(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how);
+extern int nfs_initiate_commit(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how);
+extern int pnfs_initiate_commit(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how, int pnfs);
extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
+extern void nfs_mark_list_commit(struct list_head *head);
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
struct page *, struct page *);
diff -up linux-2.6.38.noarch/fs/nfs/Kconfig.orig linux-2.6.38.noarch/fs/nfs/Kconfig
--- linux-2.6.38.noarch/fs/nfs/Kconfig.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/Kconfig 2011-03-26 07:57:44.233821664 -0400
@@ -87,6 +87,34 @@ config NFS_V4_1
config PNFS_FILE_LAYOUT
tristate
+config PNFS_OBJLAYOUT
+ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
+ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
+ help
+ Say M here if you want your pNFS client to support the Objects Layout Driver.
+ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
+ upper level driver (SCSI_OSD_ULD).
+
+ If unsure, say N.
+
+config PNFS_PANLAYOUT
+ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
+ depends on PNFS_OBJLAYOUT
+ help
+ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver.
+
+ If unsure, say N.
+
+config PNFS_BLOCK
+ tristate "Provide a pNFS block client (EXPERIMENTAL)"
+ depends on NFS_FS && NFS_V4_1
+ select MD
+ select BLK_DEV_DM
+ help
+ Say M or y here if you want your pNFS client to support the block protocol
+
+ If unsure, say N.
+
config ROOT_NFS
bool "Root file system on NFS"
depends on NFS_FS=y && IP_PNP
diff -up linux-2.6.38.noarch/fs/nfs/Makefile.orig linux-2.6.38.noarch/fs/nfs/Makefile
--- linux-2.6.38.noarch/fs/nfs/Makefile.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/Makefile 2011-03-26 07:57:44.234821653 -0400
@@ -21,3 +21,6 @@ nfs-$(CONFIG_NFS_FSCACHE) += fscache.o f
obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
+
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
+obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff -up linux-2.6.38.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.38.noarch/fs/nfs/nfs4filelayout.c
--- linux-2.6.38.noarch/fs/nfs/nfs4filelayout.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/nfs4filelayout.c 2011-03-26 07:57:44.251821506 -0400
@@ -41,7 +41,7 @@ MODULE_AUTHOR("Dean Hildebrand <dhildebz
MODULE_DESCRIPTION("The NFSv4 file layout driver");
static int
-filelayout_set_layoutdriver(struct nfs_server *nfss)
+filelayout_set_layoutdriver(struct nfs_server *nfss, const struct nfs_fh *mntfh)
{
int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
nfs4_fl_free_deviceid_callback);
@@ -66,6 +66,200 @@ filelayout_clear_layoutdriver(struct nfs
return 0;
}
+/* This function is used by the layout driver to calculate the
+ * offset of the file on the dserver based on whether the
+ * layout type is STRIPE_DENSE or STRIPE_SPARSE
+ */
+static loff_t
+filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+ switch (flseg->stripe_type) {
+ case STRIPE_SPARSE:
+ return offset;
+
+ case STRIPE_DENSE:
+ {
+ u32 stripe_width;
+ u64 tmp, off;
+ u32 unit = flseg->stripe_unit;
+
+ stripe_width = unit * flseg->dsaddr->stripe_count;
+ tmp = off = offset - flseg->pattern_offset;
+ do_div(tmp, stripe_width);
+ return tmp * unit + do_div(off, unit);
+ }
+ default:
+ BUG();
+ }
+
+ /* We should never get here... just to stop the gcc warning */
+ return 0;
+}
+
+/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ */
+static void filelayout_read_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+ if (rdata->fldata.orig_offset) {
+ dprintk("%s new off %llu orig offset %llu\n", __func__,
+ rdata->args.offset, rdata->fldata.orig_offset);
+ rdata->args.offset = rdata->fldata.orig_offset;
+ }
+
+ /* Note this may cause RPC to be resent */
+ rdata->pdata.call_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_read_release(void *data)
+{
+ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+ put_lseg(rdata->pdata.lseg);
+ rdata->pdata.lseg = NULL;
+ rdata->pdata.call_ops->rpc_release(data);
+}
+
+static void filelayout_write_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+ if (wdata->fldata.orig_offset) {
+ dprintk("%s new off %llu orig offset %llu\n", __func__,
+ wdata->args.offset, wdata->fldata.orig_offset);
+ wdata->args.offset = wdata->fldata.orig_offset;
+ }
+
+ /* Note this may cause RPC to be resent */
+ wdata->pdata.call_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_write_release(void *data)
+{
+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+ put_lseg(wdata->pdata.lseg);
+ wdata->pdata.lseg = NULL;
+ wdata->pdata.call_ops->rpc_release(data);
+}
+
+struct rpc_call_ops filelayout_read_call_ops = {
+ .rpc_call_prepare = nfs_read_prepare,
+ .rpc_call_done = filelayout_read_call_done,
+ .rpc_release = filelayout_read_release,
+};
+
+struct rpc_call_ops filelayout_write_call_ops = {
+ .rpc_call_prepare = nfs_write_prepare,
+ .rpc_call_done = filelayout_write_call_done,
+ .rpc_release = filelayout_write_release,
+};
+
+/* Perform sync or async reads.
+ *
+ * An optimization for the NFS file layout driver
+ * allows the original read/write data structs to be passed in the
+ * last argument.
+ *
+ * TODO: join with write_pagelist?
+ */
+static enum pnfs_try_status
+filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages)
+{
+ struct pnfs_layout_segment *lseg = data->pdata.lseg;
+ struct nfs4_pnfs_ds *ds;
+ loff_t offset = data->args.offset;
+ u32 idx;
+ struct nfs_fh *fh;
+
+ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n",
+ __func__, data->inode->i_ino, nr_pages,
+ data->args.pgbase, (size_t)data->args.count, offset);
+
+ /* Retrieve the correct rpc_client for the byte range */
+ idx = nfs4_fl_calc_ds_index(lseg, offset);
+ ds = nfs4_fl_prepare_ds(lseg, idx);
+ if (!ds) {
+ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+ return PNFS_NOT_ATTEMPTED;
+ }
+ dprintk("%s USE DS:ip %x %hu\n", __func__,
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+
+ /* just try the first data server for the index..*/
+ data->fldata.ds_nfs_client = ds->ds_clp;
+ fh = nfs4_fl_select_ds_fh(lseg, offset);
+ if (fh)
+ data->args.fh = fh;
+
+ /*
+ * Now get the file offset on the dserver
+ * Set the read offset to this offset, and
+ * save the original offset in orig_offset
+ * In the case of async reads, the offset will be reset in the
+ * call_ops->rpc_call_done() routine.
+ */
+ data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+ data->fldata.orig_offset = offset;
+
+ /* Perform an asynchronous read */
+ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
+ &filelayout_read_call_ops);
+
+ data->pdata.pnfs_error = 0;
+
+ return PNFS_ATTEMPTED;
+}
+
+/* Perform async writes. */
+static enum pnfs_try_status
+filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync)
+{
+ struct pnfs_layout_segment *lseg = data->pdata.lseg;
+ struct nfs4_pnfs_ds *ds;
+ loff_t offset = data->args.offset;
+ u32 idx;
+ struct nfs_fh *fh;
+
+ /* Retrieve the correct rpc_client for the byte range */
+ idx = nfs4_fl_calc_ds_index(lseg, offset);
+ ds = nfs4_fl_prepare_ds(lseg, idx);
+ if (!ds) {
+ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+ return PNFS_NOT_ATTEMPTED;
+ }
+ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
+ data->inode->i_ino, sync, (size_t) data->args.count, offset,
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+
+ data->fldata.ds_nfs_client = ds->ds_clp;
+ fh = nfs4_fl_select_ds_fh(lseg, offset);
+ if (fh)
+ data->args.fh = fh;
+ /*
+ * Get the file offset on the dserver. Set the write offset to
+ * this offset and save the original offset.
+ */
+ data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+ data->fldata.orig_offset = offset;
+
+ /*
+ * Perform an asynchronous write The offset will be reset in the
+ * call_ops->rpc_call_done() routine
+ */
+ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
+ &filelayout_write_call_ops, sync);
+
+ data->pdata.pnfs_error = 0;
+ return PNFS_ATTEMPTED;
+}
+
/*
* filelayout_check_layout()
*
@@ -87,13 +281,13 @@ filelayout_check_layout(struct pnfs_layo
dprintk("--> %s\n", __func__);
if (fl->pattern_offset > lgr->range.offset) {
- dprintk("%s pattern_offset %lld to large\n",
+ dprintk("%s pattern_offset %lld too large\n",
__func__, fl->pattern_offset);
goto out;
}
- if (fl->stripe_unit % PAGE_SIZE) {
- dprintk("%s Stripe unit (%u) not page aligned\n",
+ if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
+ dprintk("%s Invalid stripe unit (%u)\n",
__func__, fl->stripe_unit);
goto out;
}
@@ -252,14 +446,229 @@ filelayout_free_lseg(struct pnfs_layout_
_filelayout_free_lseg(fl);
}
+/* Allocate a new nfs_write_data struct and initialize */
+static struct nfs_write_data *
+filelayout_clone_write_data(struct nfs_write_data *old)
+{
+ static struct nfs_write_data *new;
+
+ new = nfs_commitdata_alloc();
+ if (!new)
+ goto out;
+ kref_init(&new->refcount);
+ new->parent = old;
+ kref_get(&old->refcount);
+ new->inode = old->inode;
+ new->cred = old->cred;
+ new->args.offset = 0;
+ new->args.count = 0;
+ new->res.count = 0;
+ new->res.fattr = &new->fattr;
+ nfs_fattr_init(&new->fattr);
+ new->res.verf = &new->verf;
+ new->args.context = get_nfs_open_context(old->args.context);
+ new->pdata.lseg = NULL;
+ new->pdata.call_ops = old->pdata.call_ops;
+ new->pdata.how = old->pdata.how;
+out:
+ return new;
+}
+
+static void filelayout_commit_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+ wdata->pdata.call_ops->rpc_call_done(task, data);
+}
+
+static struct rpc_call_ops filelayout_commit_call_ops = {
+ .rpc_call_prepare = nfs_write_prepare,
+ .rpc_call_done = filelayout_commit_call_done,
+ .rpc_release = filelayout_write_release,
+};
+
+/*
+ * Execute a COMMIT op to the MDS or to each data server on which a page
+ * in 'pages' exists.
+ * Invoke the pnfs_commit_complete callback.
+ */
+enum pnfs_try_status
+filelayout_commit(struct nfs_write_data *data, int sync)
+{
+ LIST_HEAD(head);
+ struct nfs_page *req;
+ loff_t file_offset = 0;
+ u16 idx, i;
+ struct list_head **ds_page_list = NULL;
+ u16 *indices_used;
+ int num_indices_seen = 0;
+ bool used_mds = false;
+ const struct rpc_call_ops *call_ops;
+ struct rpc_clnt *clnt;
+ struct nfs_write_data **clone_list = NULL;
+ struct nfs_write_data *dsdata;
+ struct nfs4_pnfs_ds *ds;
+
+ dprintk("%s data %p sync %d\n", __func__, data, sync);
+
+ /* Alloc room for both in one go */
+ ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) *
+ (sizeof(u16) + sizeof(struct list_head *)),
+ GFP_KERNEL);
+ if (!ds_page_list)
+ goto mem_error;
+ indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1);
+ /*
+ * Sort pages based on which ds to send to.
+ * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT.
+ * Note we are assuming there is only a single lseg in play.
+ * When that is not true, we could first sort on lseg, then
+ * sort within each as we do here.
+ */
+ while (!list_empty(&data->pages)) {
+ req = nfs_list_entry(data->pages.next);
+ nfs_list_remove_request(req);
+ if (!req->wb_lseg ||
+ ((struct nfs4_filelayout_segment *)
+ FILELAYOUT_LSEG(req->wb_lseg))->commit_through_mds)
+ idx = NFS4_PNFS_MAX_MULTI_CNT;
+ else {
+ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
+ idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset);
+ }
+ if (ds_page_list[idx]) {
+ /* Already seen this idx */
+ list_add(&req->wb_list, ds_page_list[idx]);
+ } else {
+ /* New idx not seen so far */
+ list_add_tail(&req->wb_list, &head);
+ indices_used[num_indices_seen++] = idx;
+ }
+ ds_page_list[idx] = &req->wb_list;
+ }
+ /* Once created, clone must be released via call_op */
+ clone_list = kzalloc(num_indices_seen *
+ sizeof(struct nfs_write_data *), GFP_KERNEL);
+ if (!clone_list)
+ goto mem_error;
+ for (i = 0; i < num_indices_seen - 1; i++) {
+ if (indices_used[i] == NFS4_PNFS_MAX_MULTI_CNT) {
+ used_mds = true;
+ clone_list[i] = data;
+ } else {
+ clone_list[i] = filelayout_clone_write_data(data);
+ if (!clone_list[i])
+ goto mem_error;
+ }
+ }
+ if (used_mds) {
+ clone_list[i] = filelayout_clone_write_data(data);
+ if (!clone_list[i])
+ goto mem_error;
+ } else
+ clone_list[i] = data;
+ /*
+ * Now send off the RPCs to each ds. Note that it is important
+ * that any RPC to the MDS be sent last (or at least after all
+ * clones have been made.)
+ */
+ for (i = 0; i < num_indices_seen; i++) {
+ dsdata = clone_list[i];
+ idx = indices_used[i];
+ list_cut_position(&dsdata->pages, &head, ds_page_list[idx]);
+ if (idx == NFS4_PNFS_MAX_MULTI_CNT) {
+ call_ops = data->pdata.call_ops;;
+ clnt = NFS_CLIENT(dsdata->inode);
+ ds = NULL;
+ } else {
+ struct nfs_fh *fh;
+
+ call_ops = &filelayout_commit_call_ops;
+ req = nfs_list_entry(dsdata->pages.next);
+ ds = nfs4_fl_prepare_ds(req->wb_lseg, idx);
+ if (!ds) {
+ /* Trigger retry of this chunk through MDS */
+ dsdata->task.tk_status = -EIO;
+ data->pdata.call_ops->rpc_release(dsdata);
+ continue;
+ }
+ clnt = ds->ds_clp->cl_rpcclient;
+ dsdata->fldata.ds_nfs_client = ds->ds_clp;
+ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
+ fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset);
+ if (fh)
+ dsdata->args.fh = fh;
+ }
+ dprintk("%s: Initiating commit: %llu USE DS:\n",
+ __func__, file_offset);
+ ifdebug(FACILITY)
+ print_ds(ds);
+
+ /* Send COMMIT to data server */
+ nfs_initiate_commit(dsdata, clnt, call_ops, sync);
+ }
+ kfree(clone_list);
+ kfree(ds_page_list);
+ data->pdata.pnfs_error = 0;
+ return PNFS_ATTEMPTED;
+
+ mem_error:
+ if (clone_list) {
+ for (i = 0; i < num_indices_seen - 1; i++) {
+ if (!clone_list[i])
+ break;
+ data->pdata.call_ops->rpc_release(clone_list[i]);
+ }
+ kfree(clone_list);
+ }
+ kfree(ds_page_list);
+ /* One of these will be empty, but doesn't hurt to do both */
+ nfs_mark_list_commit(&head);
+ nfs_mark_list_commit(&data->pages);
+ data->pdata.call_ops->rpc_release(data);
+ return PNFS_ATTEMPTED;
+}
+
+/*
+ * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * return 1 : coalesce page
+ * return 0 : don't coalesce page
+ *
+ * By the time this is called, we know req->wb_lseg == prev->wb_lseg
+ */
+int
+filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+ struct nfs_page *req)
+{
+ u64 p_stripe, r_stripe;
+ u32 stripe_unit;
+
+ if (!req->wb_lseg)
+ return 1;
+ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
+ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
+ stripe_unit = FILELAYOUT_LSEG(req->wb_lseg)->stripe_unit;
+
+ do_div(p_stripe, stripe_unit);
+ do_div(r_stripe, stripe_unit);
+
+ return (p_stripe == r_stripe);
+}
+
static struct pnfs_layoutdriver_type filelayout_type = {
.id = LAYOUT_NFSV4_1_FILES,
.name = "LAYOUT_NFSV4_1_FILES",
.owner = THIS_MODULE,
+ .flags = PNFS_USE_RPC_CODE,
.set_layoutdriver = filelayout_set_layoutdriver,
.clear_layoutdriver = filelayout_clear_layoutdriver,
.alloc_lseg = filelayout_alloc_lseg,
.free_lseg = filelayout_free_lseg,
+ .pg_test = filelayout_pg_test,
+ .read_pagelist = filelayout_read_pagelist,
+ .write_pagelist = filelayout_write_pagelist,
+ .commit = filelayout_commit,
};
static int __init nfs4filelayout_init(void)
diff -up linux-2.6.38.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.38.noarch/fs/nfs/nfs4filelayoutdev.c
--- linux-2.6.38.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/nfs4filelayoutdev.c 2011-03-26 07:57:44.252821497 -0400
@@ -104,6 +104,109 @@ _data_server_lookup_locked(u32 ip_addr,
return NULL;
}
+/* Create an rpc to the data server defined in 'dev_list' */
+static int
+nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
+{
+ struct nfs_server *tmp;
+ struct sockaddr_in sin;
+ struct rpc_clnt *mds_clnt = mds_srv->client;
+ struct nfs_client *clp = mds_srv->nfs_client;
+ struct sockaddr *mds_addr;
+ int err = 0;
+
+ dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
+ mds_clnt->cl_auth->au_flavor);
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ds->ds_ip_addr;
+ sin.sin_port = ds->ds_port;
+
+ /*
+ * If this DS is also the MDS, use the MDS session only if the
+ * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role.
+ */
+ mds_addr = (struct sockaddr *)&clp->cl_addr;
+ if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) {
+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) {
+ printk(KERN_INFO
+ "ip:port %x:%hu is not a pNFS Data Server\n",
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+ err = -ENODEV;
+ } else {
+ atomic_inc(&clp->cl_count);
+ ds->ds_clp = clp;
+ dprintk("%s Using MDS Session for DS\n", __func__);
+ }
+ goto out;
+ }
+
+ /* Temporary server for nfs4_set_client */
+ tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
+ if (!tmp)
+ goto out;
+
+ /*
+ * Set a retrans, timeout interval, and authflavor equal to the MDS
+ * values. Use the MDS nfs_client cl_ipaddr field so as to use the
+ * same co_ownerid as the MDS.
+ */
+ err = nfs4_set_client(tmp,
+ mds_srv->nfs_client->cl_hostname,
+ (struct sockaddr *)&sin,
+ sizeof(struct sockaddr),
+ mds_srv->nfs_client->cl_ipaddr,
+ mds_clnt->cl_auth->au_flavor,
+ IPPROTO_TCP,
+ mds_clnt->cl_xprt->timeout,
+ 1 /* minorversion */);
+ if (err < 0)
+ goto out_free;
+
+ clp = tmp->nfs_client;
+
+ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */
+ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp);
+ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS;
+
+ err = nfs4_recover_expired_lease(clp);
+ if (!err)
+ err = nfs4_check_client_ready(clp);
+ if (err)
+ goto out_put;
+
+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) {
+ printk(KERN_INFO "ip:port %x:%hu is not a pNFS Data Server\n",
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+ err = -ENODEV;
+ goto out_put;
+ }
+ /*
+ * Set DS lease equal to the MDS lease, renewal is scheduled in
+ * create_session
+ */
+ spin_lock(&mds_srv->nfs_client->cl_lock);
+ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
+ spin_unlock(&mds_srv->nfs_client->cl_lock);
+ clp->cl_last_renewal = jiffies;
+
+ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ ds->ds_clp = clp;
+
+ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__,
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
+ clp->cl_rpcclient);
+out_free:
+ kfree(tmp);
+out:
+ dprintk("%s Returns %d\n", __func__, err);
+ return err;
+out_put:
+ nfs_put_client(clp);
+ goto out_free;
+}
+
static void
destroy_ds(struct nfs4_pnfs_ds *ds)
{
@@ -455,3 +558,72 @@ nfs4_fl_find_get_deviceid(struct nfs_cli
return (d == NULL) ? NULL :
container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
}
+
+/*
+ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
+ * Then: ((res + fsi) % dsaddr->stripe_count)
+ */
+static u32
+_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+ u64 tmp;
+
+ tmp = offset - flseg->pattern_offset;
+ do_div(tmp, flseg->stripe_unit);
+ tmp += flseg->first_stripe_index;
+ return do_div(tmp, flseg->dsaddr->stripe_count);
+}
+
+u32
+nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+ u32 j;
+
+ j = _nfs4_fl_calc_j_index(lseg, offset);
+ return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
+}
+
+struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+ u32 i;
+
+ if (flseg->stripe_type == STRIPE_SPARSE) {
+ if (flseg->num_fh == 1)
+ i = 0;
+ else if (flseg->num_fh == 0)
+ return NULL;
+ else
+ i = nfs4_fl_calc_ds_index(lseg, offset);
+ } else
+ i = _nfs4_fl_calc_j_index(lseg, offset);
+ return flseg->fh_array[i];
+}
+
+struct nfs4_pnfs_ds *
+nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
+{
+ struct nfs4_file_layout_dsaddr *dsaddr;
+
+ dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
+ if (dsaddr->ds_list[ds_idx] == NULL) {
+ printk(KERN_ERR "%s: No data server for device id!\n",
+ __func__);
+ return NULL;
+ }
+
+ if (!dsaddr->ds_list[ds_idx]->ds_clp) {
+ int err;
+
+ err = nfs4_pnfs_ds_create(NFS_SERVER(lseg->pls_layout->plh_inode),
+ dsaddr->ds_list[ds_idx]);
+ if (err) {
+ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n",
+ __func__, err);
+ return NULL;
+ }
+ }
+ return dsaddr->ds_list[ds_idx];
+}
diff -up linux-2.6.38.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.38.noarch/fs/nfs/nfs4filelayout.h
--- linux-2.6.38.noarch/fs/nfs/nfs4filelayout.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/nfs4filelayout.h 2011-03-26 07:57:44.252821497 -0400
@@ -83,9 +83,15 @@ FILELAYOUT_LSEG(struct pnfs_layout_segme
generic_hdr);
}
+extern struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset);
+
extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
extern void print_ds(struct nfs4_pnfs_ds *ds);
extern void print_deviceid(struct nfs4_deviceid *dev_id);
+u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset);
+struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
+ u32 ds_idx);
extern struct nfs4_file_layout_dsaddr *
nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
struct nfs4_file_layout_dsaddr *
diff -up linux-2.6.38.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.38.noarch/fs/nfs/nfs4_fs.h
--- linux-2.6.38.noarch/fs/nfs/nfs4_fs.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/nfs4_fs.h 2011-03-26 07:57:44.250821515 -0400
@@ -250,10 +250,12 @@ static inline struct nfs4_session *nfs4_
}
extern int nfs4_setup_sequence(const struct nfs_server *server,
+ struct nfs4_session *ds_session,
struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
int cache_reply, struct rpc_task *task);
extern void nfs4_destroy_session(struct nfs4_session *session);
extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
+extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *);
extern int nfs4_proc_create_session(struct nfs_client *);
extern int nfs4_proc_destroy_session(struct nfs4_session *);
extern int nfs4_init_session(struct nfs_server *server);
@@ -266,6 +268,7 @@ static inline struct nfs4_session *nfs4_
}
static inline int nfs4_setup_sequence(const struct nfs_server *server,
+ struct nfs4_session *ds_session,
struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
int cache_reply, struct rpc_task *task)
{
@@ -283,7 +286,7 @@ extern const struct nfs4_minor_version_o
extern const u32 nfs4_fattr_bitmap[2];
extern const u32 nfs4_statfs_bitmap[2];
extern const u32 nfs4_pathconf_bitmap[2];
-extern const u32 nfs4_fsinfo_bitmap[2];
+extern const u32 nfs4_fsinfo_bitmap[3];
extern const u32 nfs4_fs_locations_bitmap[2];
/* nfs4renewd.c */
@@ -293,10 +296,10 @@ extern void nfs4_kill_renewd(struct nfs_
extern void nfs4_renew_state(struct work_struct *);
/* nfs4state.c */
+struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
#if defined(CONFIG_NFS_V4_1)
-struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
extern void nfs4_schedule_session_recovery(struct nfs4_session *);
#else
@@ -305,6 +308,17 @@ static inline void nfs4_schedule_session
}
#endif /* CONFIG_NFS_V4_1 */
+static inline struct rpc_cred *
+nfs4_get_machine_cred(struct nfs_client *clp)
+{
+ struct rpc_cred *cred;
+
+ spin_lock(&clp->cl_lock);
+ cred = nfs4_get_machine_cred_locked(clp);
+ spin_unlock(&clp->cl_lock);
+ return cred;
+}
+
extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
extern void nfs4_put_state_owner(struct nfs4_state_owner *);
extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
diff -up linux-2.6.38.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.38.noarch/fs/nfs/nfs4proc.c
--- linux-2.6.38.noarch/fs/nfs/nfs4proc.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/nfs4proc.c 2011-03-26 07:57:44.255821471 -0400
@@ -70,7 +70,7 @@ struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *);
static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
@@ -128,12 +128,13 @@ const u32 nfs4_pathconf_bitmap[2] = {
0
};
-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
+const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
| FATTR4_WORD0_MAXREAD
| FATTR4_WORD0_MAXWRITE
| FATTR4_WORD0_LEASE_TIME,
FATTR4_WORD1_TIME_DELTA
- | FATTR4_WORD1_FS_LAYOUT_TYPES
+ | FATTR4_WORD1_FS_LAYOUT_TYPES,
+ FATTR4_WORD2_LAYOUT_BLKSIZE
};
const u32 nfs4_fs_locations_bitmap[2] = {
@@ -573,6 +574,7 @@ static int nfs41_setup_sequence(struct n
}
int nfs4_setup_sequence(const struct nfs_server *server,
+ struct nfs4_session *ds_session,
struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
int cache_reply,
@@ -581,6 +583,8 @@ int nfs4_setup_sequence(const struct nfs
struct nfs4_session *session = nfs4_get_session(server);
int ret = 0;
+ if (ds_session)
+ session = ds_session;
if (session == NULL) {
args->sa_session = NULL;
res->sr_session = NULL;
@@ -611,7 +615,7 @@ static void nfs41_call_sync_prepare(stru
dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
- if (nfs4_setup_sequence(data->seq_server, data->seq_args,
+ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args,
data->seq_res, data->cache_reply, task))
return;
rpc_call_start(task);
@@ -1398,7 +1402,7 @@ static void nfs4_open_prepare(struct rpc
nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
}
data->timestamp = jiffies;
- if (nfs4_setup_sequence(data->o_arg.server,
+ if (nfs4_setup_sequence(data->o_arg.server, NULL,
&data->o_arg.seq_args,
&data->o_res.seq_res, 1, task))
return;
@@ -1573,9 +1577,8 @@ static int _nfs4_proc_open(struct nfs4_o
return 0;
}
-static int nfs4_recover_expired_lease(struct nfs_server *server)
+int nfs4_recover_expired_lease(struct nfs_client *clp)
{
- struct nfs_client *clp = server->nfs_client;
unsigned int loop;
int ret;
@@ -1591,6 +1594,7 @@ static int nfs4_recover_expired_lease(st
}
return ret;
}
+EXPORT_SYMBOL(nfs4_recover_expired_lease);
/*
* OPEN_EXPIRED:
@@ -1679,7 +1683,7 @@ static int _nfs4_do_open(struct inode *d
dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
goto out_err;
}
- status = nfs4_recover_expired_lease(server);
+ status = nfs4_recover_expired_lease(server->nfs_client);
if (status != 0)
goto err_put_state_owner;
if (path->dentry->d_inode != NULL)
@@ -1897,7 +1901,7 @@ static void nfs4_close_done(struct rpc_t
if (calldata->arg.fmode == 0)
break;
default:
- if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
}
nfs_release_seqid(calldata->arg.seqid);
@@ -1949,7 +1953,7 @@ static void nfs4_close_prepare(struct rp
nfs_fattr_init(calldata->res.fattr);
calldata->timestamp = jiffies;
- if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
+ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL,
&calldata->arg.seq_args, &calldata->res.seq_res,
1, task))
return;
@@ -2269,6 +2273,9 @@ nfs4_proc_setattr(struct dentry *dentry,
struct nfs4_state *state = NULL;
int status;
+ if (pnfs_ld_layoutret_on_setattr(inode))
+ pnfs_return_layout(inode, NULL, true);
+
nfs_fattr_init(fattr);
/* Search for an existing open(O_WRITE) file */
@@ -2596,7 +2603,7 @@ static int nfs4_proc_unlink_done(struct
if (!nfs4_sequence_done(task, &res->seq_res))
return 0;
- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN)
return 0;
update_changeattr(dir, &res->cinfo);
nfs_post_op_update_inode(dir, res->dir_attr);
@@ -2621,7 +2628,7 @@ static int nfs4_proc_rename_done(struct
if (!nfs4_sequence_done(task, &res->seq_res))
return 0;
- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN)
return 0;
update_changeattr(old_dir, &res->old_cinfo);
@@ -3072,19 +3079,31 @@ static int nfs4_proc_pathconf(struct nfs
static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
{
struct nfs_server *server = NFS_SERVER(data->inode);
+ struct nfs_client *client = server->nfs_client;
dprintk("--> %s\n", __func__);
+#ifdef CONFIG_NFS_V4_1
+ if (data->pdata.pnfsflags & PNFS_NO_RPC)
+ return 0;
+
+ /* Is this a DS session */
+ if (data->fldata.ds_nfs_client) {
+ dprintk("%s DS read\n", __func__);
+ client = data->fldata.ds_nfs_client;
+ }
+#endif /* CONFIG_NFS_V4_1 */
+
if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN;
- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
- nfs_restart_rpc(task, server->nfs_client);
+ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) {
+ nfs_restart_rpc(task, client);
return -EAGAIN;
}
nfs_invalidate_atime(data->inode);
- if (task->tk_status > 0)
+ if (task->tk_status > 0 && client == server->nfs_client)
renew_lease(server, data->timestamp);
return 0;
}
@@ -3095,20 +3114,56 @@ static void nfs4_proc_read_setup(struct
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
}
+static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data)
+{
+#ifdef CONFIG_NFS_V4_1
+ pnfs_update_last_write(nfsi, data->args.offset, data->res.count);
+ pnfs_need_layoutcommit(nfsi, data->args.context);
+#endif /* CONFIG_NFS_V4_1 */
+}
+
static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
{
struct inode *inode = data->inode;
-
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_client *client = server->nfs_client;
+
if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN;
- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+#ifdef CONFIG_NFS_V4_1
+ /* restore original count after retry? */
+ if (data->pdata.orig_count) {
+ dprintk("%s: restoring original count %u\n", __func__,
+ data->pdata.orig_count);
+ data->args.count = data->pdata.orig_count;
+ }
+
+ if (data->pdata.pnfsflags & PNFS_NO_RPC)
+ return 0;
+
+ /* Is this a DS session */
+ if (data->fldata.ds_nfs_client) {
+ dprintk("%s DS write\n", __func__);
+ client = data->fldata.ds_nfs_client;
+ }
+#endif /* CONFIG_NFS_V4_1 */
+
+ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) {
+ nfs_restart_rpc(task, client);
return -EAGAIN;
}
+
+ /*
+ * MDS write: renew lease
+ * DS write: update lastbyte written, mark for layout commit
+ */
if (task->tk_status >= 0) {
- renew_lease(NFS_SERVER(inode), data->timestamp);
- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+ if (client == server->nfs_client) {
+ renew_lease(server, data->timestamp);
+ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+ } else
+ pnfs4_update_write_done(NFS_I(inode), data);
}
return 0;
}
@@ -3121,21 +3176,42 @@ static void nfs4_proc_write_setup(struct
data->res.server = server;
data->timestamp = jiffies;
+#ifdef CONFIG_NFS_V4_1
+ /* writes to DS use pnfs vector */
+ if (data->fldata.ds_nfs_client) {
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE];
+ return;
+ }
+#endif /* CONFIG_NFS_V4_1 */
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
}
static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
{
struct inode *inode = data->inode;
-
+ struct nfs_server *server = NFS_SERVER(data->inode);
+ struct nfs_client *client = server->nfs_client;
+
+#ifdef CONFIG_NFS_V4_1
+ if (data->pdata.pnfsflags & PNFS_NO_RPC)
+ return 0;
+
+ /* Is this a DS session */
+ if (data->fldata.ds_nfs_client) {
+ dprintk("%s DS commit\n", __func__);
+ client = data->fldata.ds_nfs_client;
+ }
+#endif /* CONFIG_NFS_V4_1 */
+
if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN;
- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) {
nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
return -EAGAIN;
}
- nfs_refresh_inode(inode, data->res.fattr);
+ if (client == server->nfs_client)
+ nfs_refresh_inode(inode, data->res.fattr);
return 0;
}
@@ -3145,6 +3221,12 @@ static void nfs4_proc_commit_setup(struc
data->args.bitmask = server->cache_consistency_bitmask;
data->res.server = server;
+#if defined(CONFIG_NFS_V4_1)
+ if (data->fldata.ds_nfs_client) {
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT];
+ return;
+ }
+#endif /* CONFIG_NFS_V4_1 */
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
}
@@ -3490,9 +3572,10 @@ static int nfs4_proc_set_acl(struct inod
}
static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp)
{
- struct nfs_client *clp = server->nfs_client;
+ if (!clp)
+ clp = server->nfs_client;
if (task->tk_status >= 0)
return 0;
@@ -3524,7 +3607,8 @@ nfs4_async_handle_error(struct rpc_task
return -EAGAIN;
#endif /* CONFIG_NFS_V4_1 */
case -NFS4ERR_DELAY:
- nfs_inc_server_stats(server, NFSIOS_DELAY);
+ if (server)
+ nfs_inc_server_stats(server, NFSIOS_DELAY);
case -NFS4ERR_GRACE:
case -EKEYEXPIRED:
rpc_delay(task, NFS4_POLL_RETRY_MAX);
@@ -3537,6 +3621,8 @@ nfs4_async_handle_error(struct rpc_task
task->tk_status = nfs4_map_errors(task->tk_status);
return 0;
wait_on_recovery:
+ if (is_ds_only_client(clp))
+ return 0;
rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
@@ -3669,8 +3755,8 @@ static void nfs4_delegreturn_done(struct
renew_lease(data->res.server, data->timestamp);
break;
default:
- if (nfs4_async_handle_error(task, data->res.server, NULL) ==
- -EAGAIN) {
+ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL)
+ == -EAGAIN) {
nfs_restart_rpc(task, data->res.server->nfs_client);
return;
}
@@ -3690,7 +3776,7 @@ static void nfs4_delegreturn_prepare(str
d_data = (struct nfs4_delegreturndata *)data;
- if (nfs4_setup_sequence(d_data->res.server,
+ if (nfs4_setup_sequence(d_data->res.server, NULL,
&d_data->args.seq_args,
&d_data->res.seq_res, 1, task))
return;
@@ -3924,7 +4010,7 @@ static void nfs4_locku_done(struct rpc_t
case -NFS4ERR_EXPIRED:
break;
default:
- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN)
nfs_restart_rpc(task,
calldata->server->nfs_client);
}
@@ -3942,7 +4028,7 @@ static void nfs4_locku_prepare(struct rp
return;
}
calldata->timestamp = jiffies;
- if (nfs4_setup_sequence(calldata->server,
+ if (nfs4_setup_sequence(calldata->server, NULL,
&calldata->arg.seq_args,
&calldata->res.seq_res, 1, task))
return;
@@ -4097,7 +4183,7 @@ static void nfs4_lock_prepare(struct rpc
} else
data->arg.new_lock_owner = 0;
data->timestamp = jiffies;
- if (nfs4_setup_sequence(data->server,
+ if (nfs4_setup_sequence(data->server, NULL,
&data->arg.seq_args,
&data->res.seq_res, 1, task))
return;
@@ -5112,7 +5198,7 @@ int nfs4_init_session(struct nfs_server
session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
- ret = nfs4_recover_expired_lease(server);
+ ret = nfs4_recover_expired_lease(server->nfs_client);
if (!ret)
ret = nfs4_check_client_ready(clp);
return ret;
@@ -5381,7 +5467,7 @@ nfs4_layoutget_prepare(struct rpc_task *
* However, that is not so catastrophic, and there seems
* to be no way to prevent it completely.
*/
- if (nfs4_setup_sequence(server, &lgp->args.seq_args,
+ if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
&lgp->res.seq_res, 0, task))
return;
if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
@@ -5390,6 +5476,7 @@ nfs4_layoutget_prepare(struct rpc_task *
rpc_exit(task, NFS4_OK);
return;
}
+
rpc_call_start(task);
}
@@ -5398,11 +5485,16 @@ static void nfs4_layoutget_done(struct r
struct nfs4_layoutget *lgp = calldata;
struct nfs_server *server = NFS_SERVER(lgp->args.inode);
- dprintk("--> %s\n", __func__);
+ dprintk("--> %s: tk_status=%d\n", __func__, task->tk_status);
- if (!nfs4_sequence_done(task, &lgp->res.seq_res))
+ if (!nfs4_sequence_done(task, &lgp->res.seq_res)) {
+ /* layout code relies on fact that in this case
+ * code falls back to tk_action=call_start, but not
+ * back to rpc_prepare_task, to keep plh_outstanding
+ * correct.
+ */
return;
-
+ }
switch (task->tk_status) {
case 0:
break;
@@ -5411,7 +5503,8 @@ static void nfs4_layoutget_done(struct r
task->tk_status = -NFS4ERR_DELAY;
/* Fall through */
default:
- if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
+ dprintk("<-- %s retrying\n", __func__);
rpc_restart_call_prepare(task);
return;
}
@@ -5477,6 +5570,241 @@ int nfs4_proc_layoutget(struct nfs4_layo
return status;
}
+static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs4_layoutcommit_data *ldata =
+ (struct nfs4_layoutcommit_data *)data;
+ struct nfs_server *server = NFS_SERVER(ldata->args.inode);
+
+ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args,
+ &ldata->res.seq_res, 1, task))
+ return;
+ ldata->res.status = -1;
+ rpc_call_start(task);
+}
+
+static void
+nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutcommit_data *data =
+ (struct nfs4_layoutcommit_data *)calldata;
+ struct nfs_server *server = NFS_SERVER(data->args.inode);
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return;
+
+ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
+ nfs_restart_rpc(task, server->nfs_client);
+}
+
+static void nfs4_layoutcommit_release(void *lcdata)
+{
+ struct nfs4_layoutcommit_data *data =
+ (struct nfs4_layoutcommit_data *)lcdata;
+
+ pnfs_cleanup_layoutcommit(data->args.inode, data);
+ /* Matched by get_layout in pnfs_layoutcommit_inode */
+ put_layout_hdr(NFS_I(data->args.inode)->layout);
+ put_rpccred(data->cred);
+ kfree(lcdata);
+}
+
+static const struct rpc_call_ops nfs4_layoutcommit_ops = {
+ .rpc_call_prepare = nfs4_layoutcommit_prepare,
+ .rpc_call_done = nfs4_layoutcommit_done,
+ .rpc_release = nfs4_layoutcommit_release,
+};
+
+/* Execute a layoutcommit to the server */
+int
+nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+ .rpc_cred = data->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .task = &data->task,
+ .rpc_client = NFS_CLIENT(data->args.inode),
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_layoutcommit_ops,
+ .callback_data = data,
+ .flags = RPC_TASK_ASYNC,
+ };
+ struct rpc_task *task;
+ int status = 0;
+
+ dprintk("NFS: %4d initiating layoutcommit call. %llu@%llu lbw: %llu "
+ "type: %d issync %d\n",
+ data->task.tk_pid,
+ data->args.range.length,
+ data->args.range.offset,
+ data->args.lastbytewritten,
+ data->args.layout_type, issync);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ if (!issync)
+ goto out;
+ status = nfs4_wait_for_completion_rpc_task(task);
+ if (status != 0)
+ goto out;
+ status = task->tk_status;
+out:
+ dprintk("%s: status %d\n", __func__, status);
+ rpc_put_task(task);
+ return status;
+}
+
+static void
+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+
+ dprintk("--> %s\n", __func__);
+ if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
+ &lrp->res.seq_res, 0, task))
+ return;
+ rpc_call_start(task);
+}
+
+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+ struct nfs_server *server;
+
+ dprintk("--> %s\n", __func__);
+
+ if (!nfs4_sequence_done(task, &lrp->res.seq_res))
+ return;
+
+ if (lrp->args.return_type == RETURN_FILE)
+ server = NFS_SERVER(lrp->args.inode);
+ else
+ server = NULL;
+ if (nfs4_async_handle_error(task, server, NULL, lrp->clp) == -EAGAIN) {
+ nfs_restart_rpc(task, lrp->clp);
+ return;
+ }
+ if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) {
+ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
+
+ spin_lock(&lo->plh_inode->i_lock);
+ if (lrp->res.lrs_present)
+ pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+ else
+ BUG_ON(!list_empty(&lo->plh_segs));
+ spin_unlock(&lo->plh_inode->i_lock);
+ }
+ dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_layoutreturn_release(void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+
+ dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type);
+ if (lrp->args.return_type == RETURN_FILE) {
+ struct inode *ino = lrp->args.inode;
+ struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
+
+ put_layout_hdr(lo);
+ }
+ kfree(calldata);
+ dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
+ .rpc_call_prepare = nfs4_layoutreturn_prepare,
+ .rpc_call_done = nfs4_layoutreturn_done,
+ .rpc_release = nfs4_layoutreturn_release,
+};
+
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
+{
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
+ .rpc_argp = &lrp->args,
+ .rpc_resp = &lrp->res,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = lrp->clp->cl_rpcclient,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_layoutreturn_call_ops,
+ .callback_data = lrp,
+ .flags = RPC_TASK_ASYNC,
+ };
+ int status = 0;
+
+ dprintk("--> %s\n", __func__);
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ if (!issync)
+ goto out;
+ status = nfs4_wait_for_completion_rpc_task(task);
+ if (status != 0)
+ goto out;
+ status = task->tk_status;
+out:
+ dprintk("<-- %s\n", __func__);
+ rpc_put_task(task);
+ return status;
+}
+
+/*
+ * Retrieve the list of Data Server devices from the MDS.
+ */
+static int _nfs4_getdevicelist(struct nfs_server *server,
+ const struct nfs_fh *fh,
+ struct pnfs_devicelist *devlist)
+{
+ struct nfs4_getdevicelist_args args = {
+ .fh = fh,
+ .layoutclass = server->pnfs_curr_ld->id,
+ };
+ struct nfs4_getdevicelist_res res = {
+ .devlist = devlist,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = nfs4_get_machine_cred(server->nfs_client),
+ };
+ int status;
+
+ dprintk("--> %s\n", __func__);
+ status = nfs4_call_sync(server, &msg, &args, &res, 0);
+ put_rpccred(msg.rpc_cred);
+ dprintk("<-- %s status=%d\n", __func__, status);
+ return status;
+}
+
+int nfs4_proc_getdevicelist(struct nfs_server *server,
+ const struct nfs_fh *fh,
+ struct pnfs_devicelist *devlist)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = nfs4_handle_exception(server,
+ _nfs4_getdevicelist(server, fh, devlist),
+ &exception);
+ } while (exception.retry);
+
+ dprintk("%s: err=%d, num_devs=%u\n", __func__,
+ err, devlist->num_devs);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
+
static int
_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
{
@@ -5490,11 +5818,13 @@ _nfs4_proc_getdeviceinfo(struct nfs_serv
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
.rpc_argp = &args,
.rpc_resp = &res,
+ .rpc_cred = nfs4_get_machine_cred(server->nfs_client),
};
int status;
dprintk("--> %s\n", __func__);
status = nfs4_call_sync(server, &msg, &args, &res, 0);
+ put_rpccred(msg.rpc_cred);
dprintk("<-- %s status=%d\n", __func__, status);
return status;
diff -up linux-2.6.38.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.38.noarch/fs/nfs/nfs4renewd.c
--- linux-2.6.38.noarch/fs/nfs/nfs4renewd.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/nfs4renewd.c 2011-03-26 07:57:44.257821455 -0400
@@ -65,7 +65,7 @@ nfs4_renew_state(struct work_struct *wor
dprintk("%s: start\n", __func__);
rcu_read_lock();
- if (list_empty(&clp->cl_superblocks)) {
+ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp)) {
rcu_read_unlock();
goto out;
}
diff -up linux-2.6.38.noarch/fs/nfs/nfs4state.c.orig linux-2.6.38.noarch/fs/nfs/nfs4state.c
--- linux-2.6.38.noarch/fs/nfs/nfs4state.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/nfs4state.c 2011-03-26 07:57:44.258821447 -0400
@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(str
int status;
struct nfs_fsinfo fsinfo;
+ if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
+ nfs4_schedule_state_renewal(clp);
+ return 0;
+ }
+
status = nfs4_proc_get_lease_time(clp, &fsinfo);
if (status == 0) {
/* Update lease time and schedule renewal */
diff -up linux-2.6.38.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.38.noarch/fs/nfs/nfs4xdr.c
--- linux-2.6.38.noarch/fs/nfs/nfs4xdr.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/nfs4xdr.c 2011-03-26 07:57:44.261821422 -0400
@@ -90,7 +90,7 @@ static int nfs4_stat_to_errno(int);
#define encode_getfh_maxsz (op_encode_hdr_maxsz)
#define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \
((3+NFS4_FHSIZE) >> 2))
-#define nfs4_fattr_bitmap_maxsz 3
+#define nfs4_fattr_bitmap_maxsz 4
#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
#define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2))
#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
@@ -112,7 +112,11 @@ static int nfs4_stat_to_errno(int);
#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
#define encode_fsinfo_maxsz (encode_getattr_maxsz)
-#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11)
+/* The 5 accounts for the PNFS attributes, and assumes that at most three
+ * layout types will be returned.
+ */
+#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \
+ nfs4_fattr_bitmap_maxsz + 8 + 5)
#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
#define decode_renew_maxsz (op_decode_hdr_maxsz)
#define encode_setclientid_maxsz \
@@ -311,6 +315,17 @@ static int nfs4_stat_to_errno(int);
XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
+#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
+ encode_verifier_maxsz)
+#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
+ 2 /* nfs_cookie4 gdlr_cookie */ + \
+ decode_verifier_maxsz \
+ /* verifier4 gdlr_verifier */ + \
+ 1 /* gdlr_deviceid_list count */ + \
+ XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
+ NFS4_DEVICEID4_SIZE) \
+ /* gdlr_deviceid_list */ + \
+ 1 /* bool gdlr_eof */)
#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
@@ -324,6 +339,17 @@ static int nfs4_stat_to_errno(int);
#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
decode_stateid_maxsz + \
XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
+#define encode_layoutcommit_maxsz (18 + \
+ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \
+ op_encode_hdr_maxsz + \
+ encode_stateid_maxsz)
+#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz)
+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ 1 /* FIXME: opaque lrf_body always empty at
+ *the moment */)
+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
+ 1 + decode_stateid_maxsz)
#else /* CONFIG_NFS_V4_1 */
#define encode_sequence_maxsz 0
#define decode_sequence_maxsz 0
@@ -713,6 +739,14 @@ static int nfs4_stat_to_errno(int);
#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_reclaim_complete_maxsz)
+#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getdevicelist_maxsz)
+#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getdevicelist_maxsz)
#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz +\
encode_getdeviceinfo_maxsz)
@@ -727,6 +761,38 @@ static int nfs4_stat_to_errno(int);
decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_layoutget_maxsz)
+#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz +\
+ encode_putfh_maxsz + \
+ encode_layoutcommit_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutcommit_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz)
+#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz)
+#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz +\
+ encode_putfh_maxsz + \
+ encode_write_maxsz)
+#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_write_maxsz)
+#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_commit_maxsz)
+#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ decode_commit_maxsz)
const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
compound_encode_hdr_maxsz +
@@ -1031,6 +1097,35 @@ static void encode_getattr_two(struct xd
hdr->replen += decode_getattr_maxsz;
}
+static void
+encode_getattr_three(struct xdr_stream *xdr,
+ uint32_t bm0, uint32_t bm1, uint32_t bm2,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_GETATTR);
+ if (bm2) {
+ p = reserve_space(xdr, 16);
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(bm0);
+ *p++ = cpu_to_be32(bm1);
+ *p = cpu_to_be32(bm2);
+ } else if (bm1) {
+ p = reserve_space(xdr, 12);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(bm0);
+ *p = cpu_to_be32(bm1);
+ } else {
+ p = reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(1);
+ *p = cpu_to_be32(bm0);
+ }
+ hdr->nops++;
+ hdr->replen += decode_getattr_maxsz;
+}
+
static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
@@ -1039,8 +1134,11 @@ static void encode_getfattr(struct xdr_s
static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
+ encode_getattr_three(xdr,
+ bitmask[0] & nfs4_fsinfo_bitmap[0],
+ bitmask[1] & nfs4_fsinfo_bitmap[1],
+ bitmask[2] & nfs4_fsinfo_bitmap[2],
+ hdr);
}
static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -1767,6 +1865,26 @@ static void encode_sequence(struct xdr_s
#ifdef CONFIG_NFS_V4_1
static void
+encode_getdevicelist(struct xdr_stream *xdr,
+ const struct nfs4_getdevicelist_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+ nfs4_verifier dummy = {
+ .data = "dummmmmy",
+ };
+
+ p = reserve_space(xdr, 20);
+ *p++ = cpu_to_be32(OP_GETDEVICELIST);
+ *p++ = cpu_to_be32(args->layoutclass);
+ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
+ xdr_encode_hyper(p, 0ULL); /* cookie */
+ encode_nfs4_verifier(xdr, &dummy);
+ hdr->nops++;
+ hdr->replen += decode_getdevicelist_maxsz;
+}
+
+static void
encode_getdeviceinfo(struct xdr_stream *xdr,
const struct nfs4_getdeviceinfo_args *args,
struct compound_hdr *hdr)
@@ -1812,6 +1930,102 @@ encode_layoutget(struct xdr_stream *xdr,
hdr->nops++;
hdr->replen += decode_layoutget_maxsz;
}
+
+static void
+encode_layoutcommit(struct xdr_stream *xdr,
+ struct inode *inode,
+ const struct nfs4_layoutcommit_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__,
+ args->range.length, args->range.offset, args->lastbytewritten,
+ args->layout_type);
+
+ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
+ p = xdr_encode_hyper(p, args->range.offset);
+ p = xdr_encode_hyper(p, args->range.length);
+ *p++ = cpu_to_be32(0); /* reclaim */
+ p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(1); /* newoffset = TRUE */
+ p = xdr_encode_hyper(p, args->lastbytewritten);
+ *p = cpu_to_be32(args->time_modify_changed != 0);
+ if (args->time_modify_changed) {
+ p = reserve_space(xdr, 12);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(args->time_modify.tv_sec);
+ *p = cpu_to_be32(args->time_modify.tv_nsec);
+ }
+
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(args->layout_type);
+
+ if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
+ NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
+ NFS_I(inode)->layout, xdr, args);
+ } else {
+ p = reserve_space(xdr, 4);
+ xdr_encode_opaque(p, NULL, 0);
+ }
+
+ hdr->nops++;
+ hdr->replen += decode_layoutcommit_maxsz;
+}
+
+static void
+encode_layoutreturn(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct compound_hdr *hdr)
+{
+ nfs4_stateid stateid;
+ __be32 *p;
+
+ p = reserve_space(xdr, 20);
+ *p++ = cpu_to_be32(OP_LAYOUTRETURN);
+ *p++ = cpu_to_be32(args->reclaim);
+ *p++ = cpu_to_be32(args->layout_type);
+ *p++ = cpu_to_be32(args->range.iomode);
+ *p = cpu_to_be32(args->return_type);
+ if (args->return_type == RETURN_FILE) {
+ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
+ p = xdr_encode_hyper(p, args->range.offset);
+ p = xdr_encode_hyper(p, args->range.length);
+ spin_lock(&args->inode->i_lock);
+ memcpy(stateid.data, NFS_I(args->inode)->layout->plh_stateid.data,
+ NFS4_STATEID_SIZE);
+ spin_unlock(&args->inode->i_lock);
+ p = xdr_encode_opaque_fixed(p, &stateid.data,
+ NFS4_STATEID_SIZE);
+ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
+ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
+ NFS_I(args->inode)->layout, xdr, args);
+ } else {
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(0);
+ }
+ }
+ hdr->nops++;
+ hdr->replen += decode_layoutreturn_maxsz;
+}
+#else /* CONFIG_NFS_V4_1 */
+static int
+encode_layoutcommit(struct xdr_stream *xdr,
+ struct inode *inode,
+ const struct nfs4_layoutcommit_args *args,
+ struct compound_hdr *hdr)
+{
+ return 0;
+}
+
+static void
+encode_layoutreturn(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct compound_hdr *hdr)
+{
+}
+
#endif /* CONFIG_NFS_V4_1 */
/*
@@ -2408,7 +2622,7 @@ static void nfs4_xdr_enc_setclientid_con
struct compound_hdr hdr = {
.nops = 0,
};
- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
+ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 };
encode_compound_hdr(xdr, req, &hdr);
encode_setclientid_confirm(xdr, arg, &hdr);
@@ -2534,7 +2748,7 @@ static void nfs4_xdr_enc_get_lease_time(
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
};
- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
+ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 };
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->la_seq_args, &hdr);
@@ -2561,6 +2775,24 @@ static void nfs4_xdr_enc_reclaim_complet
}
/*
+ * Encode GETDEVICELIST request
+ */
+static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs4_getdevicelist_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_getdevicelist(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
* Encode GETDEVICEINFO request
*/
static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -2601,6 +2833,81 @@ static void nfs4_xdr_enc_layoutget(struc
encode_layoutget(xdr, args, &hdr);
encode_nops(&hdr);
}
+
+/*
+ * Encode LAYOUTCOMMIT request
+ */
+static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutcommit_args *args)
+{
+ struct nfs4_layoutcommit_data *data =
+ container_of(args, struct nfs4_layoutcommit_data, args);
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_layoutcommit(xdr, data->args.inode, args, &hdr);
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LAYOUTRETURN request
+ */
+static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+ encode_layoutreturn(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode a pNFS File Layout Data Server WRITE request
+ */
+static void nfs4_xdr_enc_dswrite(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs_writeargs *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_write(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode a pNFS File Layout Data Server COMMIT request
+ */
+static void nfs4_xdr_enc_dscommit(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs_writeargs *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_commit(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
#endif /* CONFIG_NFS_V4_1 */
static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2701,14 +3008,17 @@ static int decode_attr_bitmap(struct xdr
goto out_overflow;
bmlen = be32_to_cpup(p);
- bitmap[0] = bitmap[1] = 0;
+ bitmap[0] = bitmap[1] = bitmap[2] = 0;
p = xdr_inline_decode(xdr, (bmlen << 2));
if (unlikely(!p))
goto out_overflow;
if (bmlen > 0) {
bitmap[0] = be32_to_cpup(p++);
- if (bmlen > 1)
- bitmap[1] = be32_to_cpup(p);
+ if (bmlen > 1) {
+ bitmap[1] = be32_to_cpup(p++);
+ if (bmlen > 2)
+ bitmap[2] = be32_to_cpup(p);
+ }
}
return 0;
out_overflow:
@@ -2740,8 +3050,9 @@ static int decode_attr_supported(struct
return ret;
bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
} else
- bitmask[0] = bitmask[1] = 0;
- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]);
+ bitmask[0] = bitmask[1] = bitmask[2] = 0;
+ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+ bitmask[0], bitmask[1], bitmask[2]);
return 0;
}
@@ -3794,7 +4105,7 @@ out_overflow:
static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
{
__be32 *savep;
- uint32_t attrlen, bitmap[2] = {0};
+ uint32_t attrlen, bitmap[3] = {0};
int status;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -3820,7 +4131,7 @@ xdr_error:
static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
{
__be32 *savep;
- uint32_t attrlen, bitmap[2] = {0};
+ uint32_t attrlen, bitmap[3] = {0};
int status;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -3852,7 +4163,7 @@ xdr_error:
static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
{
__be32 *savep;
- uint32_t attrlen, bitmap[2] = {0};
+ uint32_t attrlen, bitmap[3] = {0};
int status;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -3994,7 +4305,7 @@ static int decode_getfattr_generic(struc
{
__be32 *savep;
uint32_t attrlen,
- bitmap[2] = {0};
+ bitmap[3] = {0};
int status;
status = decode_op_hdr(xdr, OP_GETATTR);
@@ -4080,10 +4391,32 @@ static int decode_attr_pnfstype(struct x
return status;
}
+/*
+ * The prefered block size for layout directed io
+ */
+static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+ *res = 0;
+ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p)) {
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+ }
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
+ }
+ return 0;
+}
+
static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
{
__be32 *savep;
- uint32_t attrlen, bitmap[2];
+ uint32_t attrlen, bitmap[3];
int status;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4111,6 +4444,9 @@ static int decode_fsinfo(struct xdr_stre
status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
if (status != 0)
goto xdr_error;
+ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
+ if (status)
+ goto xdr_error;
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
@@ -4530,7 +4866,7 @@ static int decode_getacl(struct xdr_stre
{
__be32 *savep;
uint32_t attrlen,
- bitmap[2] = {0};
+ bitmap[3] = {0};
struct kvec *iov = req->rq_rcv_buf.head;
int status;
@@ -4878,6 +5214,50 @@ out_overflow:
}
#if defined(CONFIG_NFS_V4_1)
+/*
+ * TODO: Need to handle case when EOF != true;
+ */
+static int decode_getdevicelist(struct xdr_stream *xdr,
+ struct pnfs_devicelist *res)
+{
+ __be32 *p;
+ int status, i;
+ struct nfs_writeverf verftemp;
+
+ status = decode_op_hdr(xdr, OP_GETDEVICELIST);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 8 + 8 + 4);
+ if (unlikely(!p))
+ goto out_overflow;
+
+ /* TODO: Skip cookie for now */
+ p += 2;
+
+ /* Read verifier */
+ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
+
+ res->num_devs = be32_to_cpup(p);
+
+ dprintk("%s: num_dev %d\n", __func__, res->num_devs);
+
+ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM)
+ return -NFS4ERR_REP_TOO_BIG;
+
+ p = xdr_inline_decode(xdr,
+ res->num_devs * NFS4_DEVICEID4_SIZE + 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ for (i = 0; i < res->num_devs; i++)
+ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
+ NFS4_DEVICEID4_SIZE);
+ res->eof = be32_to_cpup(p);
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
static int decode_getdeviceinfo(struct xdr_stream *xdr,
struct pnfs_device *pdev)
@@ -5003,6 +5383,56 @@ out_overflow:
print_overflow_msg(__func__, xdr);
return -EIO;
}
+
+static int decode_layoutreturn(struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ __be32 *p;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
+ if (status)
+ return status;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->lrs_present = be32_to_cpup(p);
+ if (res->lrs_present)
+ status = decode_stateid(xdr, &res->stateid);
+ return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_layoutcommit(struct xdr_stream *xdr,
+ struct rpc_rqst *req,
+ struct nfs4_layoutcommit_res *res)
+{
+ __be32 *p;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
+ res->status = status;
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->sizechanged = be32_to_cpup(p);
+
+ if (res->sizechanged) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, &res->newsize);
+ }
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
#endif /* CONFIG_NFS_V4_1 */
/*
@@ -6019,6 +6449,32 @@ static int nfs4_xdr_dec_reclaim_complete
}
/*
+ * Decode GETDEVICELIST response
+ */
+static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs4_getdevicelist_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ dprintk("encoding getdevicelist!\n");
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status != 0)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status != 0)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status != 0)
+ goto out;
+ status = decode_getdevicelist(xdr, res->devlist);
+out:
+ return status;
+}
+
+/*
* Decode GETDEVINFO response
*/
static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -6062,6 +6518,108 @@ static int nfs4_xdr_dec_layoutget(struct
out:
return status;
}
+
+/*
+ * Decode LAYOUTRETURN response
+ */
+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_layoutreturn(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode LAYOUTCOMMIT response
+ */
+static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutcommit_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_layoutcommit(xdr, rqstp, res);
+ if (status)
+ goto out;
+ decode_getfattr(xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
+out:
+ return status;
+}
+
+/*
+ * Decode pNFS File Layout Data Server WRITE response
+ */
+static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs_writeres *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_write(xdr, res);
+ if (!status)
+ return res->count;
+out:
+ return status;
+}
+
+/*
+ * Decode pNFS File Layout Data Server COMMIT response
+ */
+static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs_writeres *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_commit(xdr, res);
+out:
+ return status;
+}
#endif /* CONFIG_NFS_V4_1 */
/**
@@ -6081,7 +6639,7 @@ out:
int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
int plus)
{
- uint32_t bitmap[2] = {0};
+ uint32_t bitmap[3] = {0};
uint32_t len;
__be32 *p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
@@ -6263,8 +6821,13 @@ struct rpc_procinfo nfs4_procedures[] =
PROC(SEQUENCE, enc_sequence, dec_sequence),
PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
+ PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
+ PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
+ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
+ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite),
+ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit),
#endif /* CONFIG_NFS_V4_1 */
};
diff -up linux-2.6.38.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.38.noarch/fs/nfs/objlayout/Kbuild
--- linux-2.6.38.noarch/fs/nfs/objlayout/Kbuild.orig 2011-03-26 07:57:44.262821413 -0400
+++ linux-2.6.38.noarch/fs/nfs/objlayout/Kbuild 2011-03-26 07:57:44.262821413 -0400
@@ -0,0 +1,11 @@
+#
+# Makefile for the pNFS Objects Layout Driver kernel module
+#
+objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
+
+#
+# Panasas pNFS Layout Driver kernel module
+#
+panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o
+obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o
diff -up linux-2.6.38.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.38.noarch/fs/nfs/objlayout/objio_osd.c
--- linux-2.6.38.noarch/fs/nfs/objlayout/objio_osd.c.orig 2011-03-26 07:57:44.263821404 -0400
+++ linux-2.6.38.noarch/fs/nfs/objlayout/objio_osd.c 2011-03-26 07:57:44.263821404 -0400
@@ -0,0 +1,1060 @@
+/*
+ * objio_osd.c
+ *
+ * pNFS Objects layout implementation over open-osd initiator library
+ *
+ * Copyright (C) 2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <bharrosh@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <scsi/scsi_device.h>
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_sec.h>
+#include <scsi/osd_sense.h>
+
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+#define _LLU(x) ((unsigned long long)x)
+
+enum { BIO_MAX_PAGES_KMALLOC =
+ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+};
+
+/* A per mountpoint struct currently for device cache */
+struct objio_mount_type {
+ struct list_head dev_list;
+ spinlock_t dev_list_lock;
+};
+
+struct _dev_ent {
+ struct list_head list;
+ struct nfs4_deviceid d_id;
+ struct osd_dev *od;
+};
+
+static void _dev_list_remove_all(struct objio_mount_type *omt)
+{
+ spin_lock(&omt->dev_list_lock);
+
+ while (!list_empty(&omt->dev_list)) {
+ struct _dev_ent *de = list_entry(omt->dev_list.next,
+ struct _dev_ent, list);
+
+ list_del_init(&de->list);
+ osduld_put_device(de->od);
+ kfree(de);
+ }
+
+ spin_unlock(&omt->dev_list_lock);
+}
+
+static struct osd_dev *___dev_list_find(struct objio_mount_type *omt,
+ struct nfs4_deviceid *d_id)
+{
+ struct list_head *le;
+
+ list_for_each(le, &omt->dev_list) {
+ struct _dev_ent *de = list_entry(le, struct _dev_ent, list);
+
+ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id)))
+ return de->od;
+ }
+
+ return NULL;
+}
+
+static struct osd_dev *_dev_list_find(struct objio_mount_type *omt,
+ struct nfs4_deviceid *d_id)
+{
+ struct osd_dev *od;
+
+ spin_lock(&omt->dev_list_lock);
+ od = ___dev_list_find(omt, d_id);
+ spin_unlock(&omt->dev_list_lock);
+ return od;
+}
+
+static int _dev_list_add(struct objio_mount_type *omt,
+ struct nfs4_deviceid *d_id, struct osd_dev *od)
+{
+ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL);
+
+ if (!de)
+ return -ENOMEM;
+
+ spin_lock(&omt->dev_list_lock);
+
+ if (___dev_list_find(omt, d_id)) {
+ kfree(de);
+ goto out;
+ }
+
+ de->d_id = *d_id;
+ de->od = od;
+ list_add(&de->list, &omt->dev_list);
+
+out:
+ spin_unlock(&omt->dev_list_lock);
+ return 0;
+}
+
+struct objio_segment {
+ struct pnfs_osd_layout *layout;
+
+ unsigned mirrors_p1;
+ unsigned stripe_unit;
+ unsigned group_width; /* Data stripe_units without integrity comps */
+ u64 group_depth;
+ unsigned group_count;
+
+ unsigned num_comps;
+ /* variable length */
+ struct osd_dev *ods[1];
+};
+
+struct objio_state;
+typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
+
+struct objio_state {
+ /* Generic layer */
+ struct objlayout_io_state ol_state;
+
+ struct objio_segment *objio_seg;
+
+ struct kref kref;
+ objio_done_fn done;
+ void *private;
+
+ unsigned long length;
+ unsigned numdevs; /* Actually used devs in this IO */
+ /* A per-device variable array of size numdevs */
+ struct _objio_per_comp {
+ struct bio *bio;
+ struct osd_request *or;
+ unsigned long length;
+ u64 offset;
+ unsigned dev;
+ } per_dev[];
+};
+
+/* Send and wait for a get_device_info of devices in the layout,
+ then look them up with the osd_initiator library */
+static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay,
+ struct objio_segment *objio_seg, unsigned comp)
+{
+ struct pnfs_osd_layout *layout = objio_seg->layout;
+ struct pnfs_osd_deviceaddr *deviceaddr;
+ struct nfs4_deviceid *d_id;
+ struct osd_dev *od;
+ struct osd_dev_info odi;
+ struct objio_mount_type *omt = NFS_SERVER(pnfslay->plh_inode)->pnfs_ld_data;
+ int err;
+
+ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id;
+
+ od = _dev_list_find(omt, d_id);
+ if (od)
+ return od;
+
+ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr);
+ if (unlikely(err)) {
+ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err);
+ return ERR_PTR(err);
+ }
+
+ odi.systemid_len = deviceaddr->oda_systemid.len;
+ if (odi.systemid_len > sizeof(odi.systemid)) {
+ err = -EINVAL;
+ goto out;
+ } else if (odi.systemid_len)
+ memcpy(odi.systemid, deviceaddr->oda_systemid.data,
+ odi.systemid_len);
+ odi.osdname_len = deviceaddr->oda_osdname.len;
+ odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
+
+ if (!odi.osdname_len && !odi.systemid_len) {
+ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
+ __func__);
+ err = -ENODEV;
+ goto out;
+ }
+
+ od = osduld_info_lookup(&odi);
+ if (unlikely(IS_ERR(od))) {
+ err = PTR_ERR(od);
+ dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
+ goto out;
+ }
+
+ _dev_list_add(omt, d_id, od);
+
+out:
+ dprintk("%s: return=%d\n", __func__, err);
+ objlayout_put_deviceinfo(deviceaddr);
+ return err ? ERR_PTR(err) : od;
+}
+
+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
+ struct objio_segment *objio_seg)
+{
+ struct pnfs_osd_layout *layout = objio_seg->layout;
+ unsigned i, num_comps = layout->olo_num_comps;
+ int err;
+
+ /* lookup all devices */
+ for (i = 0; i < num_comps; i++) {
+ struct osd_dev *od;
+
+ od = _device_lookup(pnfslay, objio_seg, i);
+ if (unlikely(IS_ERR(od))) {
+ err = PTR_ERR(od);
+ goto out;
+ }
+ objio_seg->ods[i] = od;
+ }
+ objio_seg->num_comps = num_comps;
+ err = 0;
+
+out:
+ dprintk("%s: return=%d\n", __func__, err);
+ return err;
+}
+
+static int _verify_data_map(struct pnfs_osd_layout *layout)
+{
+ struct pnfs_osd_data_map *data_map = &layout->olo_map;
+ u64 stripe_length;
+ u32 group_width;
+
+/* FIXME: Only raid0 for now. if not go through MDS */
+ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
+ printk(KERN_ERR "Only RAID_0 for now\n");
+ return -ENOTSUPP;
+ }
+ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
+ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
+ data_map->odm_num_comps, data_map->odm_mirror_cnt);
+ return -EINVAL;
+ }
+
+ if (data_map->odm_group_width)
+ group_width = data_map->odm_group_width;
+ else
+ group_width = data_map->odm_num_comps /
+ (data_map->odm_mirror_cnt + 1);
+
+ stripe_length = (u64)data_map->odm_stripe_unit * group_width;
+ if (stripe_length >= (1ULL << 32)) {
+ printk(KERN_ERR "Total Stripe length(0x%llx)"
+ " >= 32bit is not supported\n", _LLU(stripe_length));
+ return -ENOTSUPP;
+ }
+
+ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
+ printk(KERN_ERR "Stripe Unit(0x%llx)"
+ " must be Multples of PAGE_SIZE(0x%lx)\n",
+ _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
+ return -ENOTSUPP;
+ }
+
+ return 0;
+}
+
+int objio_alloc_lseg(void **outp,
+ struct pnfs_layout_hdr *pnfslay,
+ struct pnfs_layout_segment *lseg,
+ struct pnfs_osd_layout *layout)
+{
+ struct objio_segment *objio_seg;
+ int err;
+
+ err = _verify_data_map(layout);
+ if (unlikely(err))
+ return err;
+
+ objio_seg = kzalloc(sizeof(*objio_seg) +
+ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]),
+ GFP_KERNEL);
+ if (!objio_seg)
+ return -ENOMEM;
+
+ objio_seg->layout = layout;
+ err = objio_devices_lookup(pnfslay, objio_seg);
+ if (err)
+ goto free_seg;
+
+ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1;
+ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit;
+ if (layout->olo_map.odm_group_width) {
+ objio_seg->group_width = layout->olo_map.odm_group_width;
+ objio_seg->group_depth = layout->olo_map.odm_group_depth;
+ objio_seg->group_count = layout->olo_map.odm_num_comps /
+ objio_seg->mirrors_p1 /
+ objio_seg->group_width;
+ } else {
+ objio_seg->group_width = layout->olo_map.odm_num_comps /
+ objio_seg->mirrors_p1;
+ objio_seg->group_depth = -1;
+ objio_seg->group_count = 1;
+ }
+
+ *outp = objio_seg;
+ return 0;
+
+free_seg:
+ dprintk("%s: Error: return %d\n", __func__, err);
+ kfree(objio_seg);
+ *outp = NULL;
+ return err;
+}
+
+void objio_free_lseg(void *p)
+{
+ struct objio_segment *objio_seg = p;
+
+ kfree(objio_seg);
+}
+
+int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
+{
+ struct objio_segment *objio_seg = seg;
+ struct objio_state *ios;
+ const unsigned first_size = sizeof(*ios) +
+ objio_seg->num_comps * sizeof(ios->per_dev[0]);
+ const unsigned sec_size = objio_seg->num_comps *
+ sizeof(ios->ol_state.ioerrs[0]);
+
+ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps);
+ ios = kzalloc(first_size + sec_size, GFP_KERNEL);
+ if (unlikely(!ios))
+ return -ENOMEM;
+
+ ios->objio_seg = objio_seg;
+ ios->ol_state.ioerrs = ((void *)ios) + first_size;
+ ios->ol_state.num_comps = objio_seg->num_comps;
+
+ *outp = &ios->ol_state;
+ return 0;
+}
+
+void objio_free_io_state(struct objlayout_io_state *ol_state)
+{
+ struct objio_state *ios = container_of(ol_state, struct objio_state,
+ ol_state);
+
+ kfree(ios);
+}
+
+enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
+{
+ switch (oep) {
+ case OSD_ERR_PRI_NO_ERROR:
+ return (enum pnfs_osd_errno)0;
+
+ case OSD_ERR_PRI_CLEAR_PAGES:
+ BUG_ON(1);
+ return 0;
+
+ case OSD_ERR_PRI_RESOURCE:
+ return PNFS_OSD_ERR_RESOURCE;
+ case OSD_ERR_PRI_BAD_CRED:
+ return PNFS_OSD_ERR_BAD_CRED;
+ case OSD_ERR_PRI_NO_ACCESS:
+ return PNFS_OSD_ERR_NO_ACCESS;
+ case OSD_ERR_PRI_UNREACHABLE:
+ return PNFS_OSD_ERR_UNREACHABLE;
+ case OSD_ERR_PRI_NOT_FOUND:
+ return PNFS_OSD_ERR_NOT_FOUND;
+ case OSD_ERR_PRI_NO_SPACE:
+ return PNFS_OSD_ERR_NO_SPACE;
+ default:
+ WARN_ON(1);
+ /* fallthrough */
+ case OSD_ERR_PRI_EIO:
+ return PNFS_OSD_ERR_EIO;
+ }
+}
+
+static void _clear_bio(struct bio *bio)
+{
+ struct bio_vec *bv;
+ unsigned i;
+
+ __bio_for_each_segment(bv, bio, i, 0) {
+ unsigned this_count = bv->bv_len;
+
+ if (likely(PAGE_SIZE == this_count))
+ clear_highpage(bv->bv_page);
+ else
+ zero_user(bv->bv_page, bv->bv_offset, this_count);
+ }
+}
+
+static int _io_check(struct objio_state *ios, bool is_write)
+{
+ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
+ int lin_ret = 0;
+ int i;
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct osd_sense_info osi;
+ struct osd_request *or = ios->per_dev[i].or;
+ int ret;
+
+ if (!or)
+ continue;
+
+ ret = osd_req_decode_sense(or, &osi);
+ if (likely(!ret))
+ continue;
+
+ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
+ /* start read offset passed endof file */
+ BUG_ON(is_write);
+ _clear_bio(ios->per_dev[i].bio);
+ dprintk("%s: start read offset passed end of file "
+ "offset=0x%llx, length=0x%lx\n", __func__,
+ _LLU(ios->per_dev[i].offset),
+ ios->per_dev[i].length);
+
+ continue; /* we recovered */
+ }
+ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev,
+ osd_pri_2_pnfs_err(osi.osd_err_pri),
+ ios->per_dev[i].offset,
+ ios->per_dev[i].length,
+ is_write);
+
+ if (osi.osd_err_pri >= oep) {
+ oep = osi.osd_err_pri;
+ lin_ret = ret;
+ }
+ }
+
+ return lin_ret;
+}
+
+/*
+ * Common IO state helpers.
+ */
+static void _io_free(struct objio_state *ios)
+{
+ unsigned i;
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct _objio_per_comp *per_dev = &ios->per_dev[i];
+
+ if (per_dev->or) {
+ osd_end_request(per_dev->or);
+ per_dev->or = NULL;
+ }
+
+ if (per_dev->bio) {
+ bio_put(per_dev->bio);
+ per_dev->bio = NULL;
+ }
+ }
+}
+
+struct osd_dev * _io_od(struct objio_state *ios, unsigned dev)
+{
+ unsigned min_dev = ios->objio_seg->layout->olo_comps_index;
+ unsigned max_dev = min_dev + ios->ol_state.num_comps;
+
+ BUG_ON(dev < min_dev || max_dev <= dev);
+ return ios->objio_seg->ods[dev - min_dev];
+}
+
+struct _striping_info {
+ u64 obj_offset;
+ u64 group_length;
+ u64 total_group_length;
+ u64 Major;
+ unsigned dev;
+ unsigned unit_off;
+};
+
+static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
+ struct _striping_info *si)
+{
+ u32 stripe_unit = ios->objio_seg->stripe_unit;
+ u32 group_width = ios->objio_seg->group_width;
+ u64 group_depth = ios->objio_seg->group_depth;
+ u32 U = stripe_unit * group_width;
+
+ u64 T = U * group_depth;
+ u64 S = T * ios->objio_seg->group_count;
+ u64 M = div64_u64(file_offset, S);
+
+ /*
+ G = (L - (M * S)) / T
+ H = (L - (M * S)) % T
+ */
+ u64 LmodU = file_offset - M * S;
+ u32 G = div64_u64(LmodU, T);
+ u64 H = LmodU - G * T;
+
+ u32 N = div_u64(H, U);
+
+ div_u64_rem(file_offset, stripe_unit, &si->unit_off);
+ si->obj_offset = si->unit_off + (N * stripe_unit) +
+ (M * group_depth * stripe_unit);
+
+ /* "H - (N * U)" is just "H % U" so it's bound to u32 */
+ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+ si->dev *= ios->objio_seg->mirrors_p1;
+
+ si->group_length = T - H;
+ si->total_group_length = T;
+ si->Major = M;
+}
+
+static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
+ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len)
+{
+ unsigned pg = *cur_pg;
+ struct request_queue *q =
+ osd_request_queue(_io_od(ios, per_dev->dev));
+
+ per_dev->length += cur_len;
+
+ if (per_dev->bio == NULL) {
+ unsigned stripes = ios->ol_state.num_comps /
+ ios->objio_seg->mirrors_p1;
+ unsigned pages_in_stripe = stripes *
+ (ios->objio_seg->stripe_unit / PAGE_SIZE);
+ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
+ stripes;
+
+ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
+ if (unlikely(!per_dev->bio)) {
+ dprintk("Faild to allocate BIO size=%u\n", bio_size);
+ return -ENOMEM;
+ }
+ }
+
+ while (cur_len > 0) {
+ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+ unsigned added_len;
+
+ BUG_ON(ios->ol_state.nr_pages <= pg);
+ cur_len -= pglen;
+
+ added_len = bio_add_pc_page(q, per_dev->bio,
+ ios->ol_state.pages[pg], pglen, pgbase);
+ if (unlikely(pglen != added_len))
+ return -ENOMEM;
+ pgbase = 0;
+ ++pg;
+ }
+ BUG_ON(cur_len);
+
+ *cur_pg = pg;
+ return 0;
+}
+
+static int _prepare_one_group(struct objio_state *ios, u64 length,
+ struct _striping_info *si, unsigned first_comp,
+ unsigned *last_pg)
+{
+ unsigned stripe_unit = ios->objio_seg->stripe_unit;
+ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1;
+ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1;
+ unsigned dev = si->dev;
+ unsigned first_dev = dev - (dev % devs_in_group);
+ unsigned comp = first_comp + (dev - first_dev);
+ unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+ unsigned cur_pg = *last_pg;
+ int ret = 0;
+
+ while (length) {
+ struct _objio_per_comp *per_dev = &ios->per_dev[comp];
+ unsigned cur_len, page_off = 0;
+
+ if (!per_dev->length) {
+ per_dev->dev = dev;
+ if (dev < si->dev) {
+ per_dev->offset = si->obj_offset + stripe_unit -
+ si->unit_off;
+ cur_len = stripe_unit;
+ } else if (dev == si->dev) {
+ per_dev->offset = si->obj_offset;
+ cur_len = stripe_unit - si->unit_off;
+ page_off = si->unit_off & ~PAGE_MASK;
+ BUG_ON(page_off &&
+ (page_off != ios->ol_state.pgbase));
+ } else { /* dev > si->dev */
+ per_dev->offset = si->obj_offset - si->unit_off;
+ cur_len = stripe_unit;
+ }
+
+ if (max_comp < comp)
+ max_comp = comp;
+
+ dev += mirrors_p1;
+ dev = (dev % devs_in_group) + first_dev;
+ } else {
+ cur_len = stripe_unit;
+ }
+ if (cur_len >= length)
+ cur_len = length;
+
+ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
+ cur_len);
+ if (unlikely(ret))
+ goto out;
+
+ comp += mirrors_p1;
+ comp = (comp % devs_in_group) + first_comp;
+
+ length -= cur_len;
+ ios->length += cur_len;
+ }
+out:
+ ios->numdevs = max_comp + mirrors_p1;
+ *last_pg = cur_pg;
+ return ret;
+}
+
+static int _io_rw_pagelist(struct objio_state *ios)
+{
+ u64 length = ios->ol_state.count;
+ struct _striping_info si;
+ unsigned devs_in_group = ios->objio_seg->group_width *
+ ios->objio_seg->mirrors_p1;
+ unsigned first_comp = 0;
+ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps;
+ unsigned last_pg = 0;
+ int ret = 0;
+
+ _calc_stripe_info(ios, ios->ol_state.offset, &si);
+ while (length) {
+ if (length < si.group_length)
+ si.group_length = length;
+
+ ret = _prepare_one_group(ios, si.group_length, &si, first_comp,
+ &last_pg);
+ if (unlikely(ret))
+ goto out;
+
+ length -= si.group_length;
+
+ si.group_length = si.total_group_length;
+ si.unit_off = 0;
+ ++si.Major;
+ si.obj_offset = si.Major * ios->objio_seg->stripe_unit *
+ ios->objio_seg->group_depth;
+
+ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
+ si.dev %= num_comps;
+
+ first_comp += devs_in_group;
+ first_comp %= num_comps;
+ }
+
+out:
+ if (!ios->length)
+ return ret;
+
+ return 0;
+}
+
+static ssize_t _sync_done(struct objio_state *ios)
+{
+ struct completion *waiting = ios->private;
+
+ complete(waiting);
+ return 0;
+}
+
+static void _last_io(struct kref *kref)
+{
+ struct objio_state *ios = container_of(kref, struct objio_state, kref);
+
+ ios->done(ios);
+}
+
+static void _done_io(struct osd_request *or, void *p)
+{
+ struct objio_state *ios = p;
+
+ kref_put(&ios->kref, _last_io);
+}
+
+static ssize_t _io_exec(struct objio_state *ios)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ ssize_t status = 0; /* sync status */
+ unsigned i;
+ objio_done_fn saved_done_fn = ios->done;
+ bool sync = ios->ol_state.sync;
+
+ if (sync) {
+ ios->done = _sync_done;
+ ios->private = &wait;
+ }
+
+ kref_init(&ios->kref);
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct osd_request *or = ios->per_dev[i].or;
+
+ if (!or)
+ continue;
+
+ kref_get(&ios->kref);
+ osd_execute_request_async(or, _done_io, ios);
+ }
+
+ kref_put(&ios->kref, _last_io);
+
+ if (sync) {
+ wait_for_completion(&wait);
+ status = saved_done_fn(ios);
+ }
+
+ return status;
+}
+
+/*
+ * read
+ */
+static ssize_t _read_done(struct objio_state *ios)
+{
+ ssize_t status;
+ int ret = _io_check(ios, false);
+
+ _io_free(ios);
+
+ if (likely(!ret))
+ status = ios->length;
+ else
+ status = ret;
+
+ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
+ return status;
+}
+
+static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+ struct osd_request *or = NULL;
+ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+ unsigned dev = per_dev->dev;
+ struct pnfs_osd_object_cred *cred =
+ &ios->objio_seg->layout->olo_comps[dev];
+ struct osd_obj_id obj = {
+ .partition = cred->oc_object_id.oid_partition_id,
+ .id = cred->oc_object_id.oid_object_id,
+ };
+ int ret;
+
+ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
+ if (unlikely(!or)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ per_dev->or = or;
+
+ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
+
+ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+ if (ret) {
+ dprintk("%s: Faild to osd_finalize_request() => %d\n",
+ __func__, ret);
+ goto err;
+ }
+
+ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+ per_dev->length);
+
+err:
+ return ret;
+}
+
+static ssize_t _read_exec(struct objio_state *ios)
+{
+ unsigned i;
+ int ret;
+
+ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
+ if (!ios->per_dev[i].length)
+ continue;
+ ret = _read_mirrors(ios, i);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ ios->done = _read_done;
+ return _io_exec(ios); /* In sync mode exec returns the io status */
+
+err:
+ _io_free(ios);
+ return ret;
+}
+
+ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
+{
+ struct objio_state *ios = container_of(ol_state, struct objio_state,
+ ol_state);
+ int ret;
+
+ ret = _io_rw_pagelist(ios);
+ if (unlikely(ret))
+ return ret;
+
+ return _read_exec(ios);
+}
+
+/*
+ * write
+ */
+static ssize_t _write_done(struct objio_state *ios)
+{
+ ssize_t status;
+ int ret = _io_check(ios, true);
+
+ _io_free(ios);
+
+ if (likely(!ret)) {
+ /* FIXME: should be based on the OSD's persistence model
+ * See OSD2r05 Section 4.13 Data persistence model */
+ ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC;
+ status = ios->length;
+ } else {
+ status = ret;
+ }
+
+ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
+ return status;
+}
+
+static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
+ unsigned dev = ios->per_dev[cur_comp].dev;
+ unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1;
+ int ret;
+
+ for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+ struct osd_request *or = NULL;
+ struct pnfs_osd_object_cred *cred =
+ &ios->objio_seg->layout->olo_comps[dev];
+ struct osd_obj_id obj = {
+ .partition = cred->oc_object_id.oid_partition_id,
+ .id = cred->oc_object_id.oid_object_id,
+ };
+ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+ struct bio *bio;
+
+ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
+ if (unlikely(!or)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ per_dev->or = or;
+
+ if (per_dev != master_dev) {
+ bio = bio_kmalloc(GFP_KERNEL,
+ master_dev->bio->bi_max_vecs);
+ if (unlikely(!bio)) {
+ dprintk("Faild to allocate BIO size=%u\n",
+ master_dev->bio->bi_max_vecs);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ __bio_clone(bio, master_dev->bio);
+ bio->bi_bdev = NULL;
+ bio->bi_next = NULL;
+ per_dev->bio = bio;
+ per_dev->dev = dev;
+ per_dev->length = master_dev->length;
+ per_dev->offset = master_dev->offset;
+ } else {
+ bio = master_dev->bio;
+ /* FIXME: bio_set_dir() */
+ bio->bi_rw |= REQ_WRITE;
+ }
+
+ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
+
+ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+ if (ret) {
+ dprintk("%s: Faild to osd_finalize_request() => %d\n",
+ __func__, ret);
+ goto err;
+ }
+
+ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+ per_dev->length);
+ }
+
+err:
+ return ret;
+}
+
+static ssize_t _write_exec(struct objio_state *ios)
+{
+ unsigned i;
+ int ret;
+
+ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
+ if (!ios->per_dev[i].length)
+ continue;
+ ret = _write_mirrors(ios, i);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ ios->done = _write_done;
+ return _io_exec(ios); /* In sync mode exec returns the io->status */
+
+err:
+ _io_free(ios);
+ return ret;
+}
+
+ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
+{
+ struct objio_state *ios = container_of(ol_state, struct objio_state,
+ ol_state);
+ int ret;
+
+ /* TODO: ios->stable = stable; */
+ ret = _io_rw_pagelist(ios);
+ if (unlikely(ret))
+ return ret;
+
+ return _write_exec(ios);
+}
+
+/*
+ * Policy Operations
+ */
+
+/*
+ * Get the max [rw]size
+ */
+static ssize_t
+objlayout_get_blocksize(void)
+{
+ ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE;
+
+ return sz;
+}
+
+/*
+ * Don't gather across stripes, but rather gather (coalesce) up to
+ * the stripe size.
+ *
+ * FIXME: change interface to use merge_align, merge_count
+ */
+static struct pnfs_layoutdriver_type objlayout_type = {
+ .id = LAYOUT_OSD2_OBJECTS,
+ .name = "LAYOUT_OSD2_OBJECTS",
+ .flags = PNFS_LAYOUTRET_ON_SETATTR,
+
+ .set_layoutdriver = objlayout_set_layoutdriver,
+ .clear_layoutdriver = objlayout_clear_layoutdriver,
+
+ .alloc_layout_hdr = objlayout_alloc_layout_hdr,
+ .free_layout_hdr = objlayout_free_layout_hdr,
+
+ .alloc_lseg = objlayout_alloc_lseg,
+ .free_lseg = objlayout_free_lseg,
+
+ .get_blocksize = objlayout_get_blocksize,
+
+ .read_pagelist = objlayout_read_pagelist,
+ .write_pagelist = objlayout_write_pagelist,
+ .commit = objlayout_commit,
+
+ .encode_layoutcommit = objlayout_encode_layoutcommit,
+ .encode_layoutreturn = objlayout_encode_layoutreturn,
+};
+
+void *objio_init_mt(void)
+{
+ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL);
+
+ if (!omt)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&omt->dev_list);
+ spin_lock_init(&omt->dev_list_lock);
+ return omt;
+}
+
+void objio_fini_mt(void *mountid)
+{
+ _dev_list_remove_all(mountid);
+ kfree(mountid);
+}
+
+MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
+MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
+MODULE_LICENSE("GPL");
+
+static int __init
+objlayout_init(void)
+{
+ int ret = pnfs_register_layoutdriver(&objlayout_type);
+
+ if (ret)
+ printk(KERN_INFO
+ "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
+ __func__, ret);
+ else
+ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
+ __func__);
+ return ret;
+}
+
+static void __exit
+objlayout_exit(void)
+{
+ pnfs_unregister_layoutdriver(&objlayout_type);
+ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
+ __func__);
+}
+
+module_init(objlayout_init);
+module_exit(objlayout_exit);
diff -up linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.c
--- linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.c.orig 2011-03-26 07:57:44.265821386 -0400
+++ linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.c 2011-03-26 07:57:44.265821386 -0400
@@ -0,0 +1,773 @@
+/*
+ * objlayout.c
+ *
+ * pNFS layout driver for Panasas OSDs
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <scsi/osd_initiator.h>
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+struct pnfs_client_operations *pnfs_client_ops;
+
+/*
+ * Create a objlayout layout structure for the given inode and return it.
+ */
+struct pnfs_layout_hdr *
+objlayout_alloc_layout_hdr(struct inode *inode)
+{
+ struct objlayout *objlay;
+
+ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL);
+ if (objlay) {
+ spin_lock_init(&objlay->lock);
+ INIT_LIST_HEAD(&objlay->err_list);
+ }
+ dprintk("%s: Return %p\n", __func__, objlay);
+ return &objlay->pnfs_layout;
+}
+
+/*
+ * Free an objlayout layout structure
+ */
+void
+objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct objlayout *objlay = OBJLAYOUT(lo);
+
+ dprintk("%s: objlay %p\n", __func__, objlay);
+
+ WARN_ON(!list_empty(&objlay->err_list));
+ kfree(objlay);
+}
+
+/*
+ * Unmarshall layout and store it in pnfslay.
+ */
+struct pnfs_layout_segment *
+objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
+ struct nfs4_layoutget_res *lgr)
+{
+ int status;
+ void *layout = lgr->layout.buf;
+ struct objlayout_segment *objlseg;
+ struct pnfs_osd_layout *pnfs_osd_layout;
+
+ dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout);
+
+ BUG_ON(!layout);
+
+ status = -ENOMEM;
+ objlseg = kzalloc(sizeof(*objlseg) +
+ pnfs_osd_layout_incore_sz(layout), GFP_KERNEL);
+ if (!objlseg)
+ goto err;
+
+ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout;
+ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout);
+
+ objlseg->lseg.pls_range = lgr->range;
+ status = objio_alloc_lseg(&objlseg->internal, pnfslay, &objlseg->lseg,
+ pnfs_osd_layout);
+ if (status)
+ goto err;
+
+ dprintk("%s: Return %p\n", __func__, &objlseg->lseg);
+ return &objlseg->lseg;
+
+ err:
+ kfree(objlseg);
+ return ERR_PTR(status);
+}
+
+/*
+ * Free a layout segement
+ */
+void
+objlayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ struct objlayout_segment *objlseg;
+
+ dprintk("%s: freeing layout segment %p\n", __func__, lseg);
+
+ if (unlikely(!lseg))
+ return;
+
+ objlseg = container_of(lseg, struct objlayout_segment, lseg);
+ objio_free_lseg(objlseg->internal);
+ kfree(objlseg);
+}
+
+/*
+ * I/O Operations
+ */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ end = start + len;
+ return end >= start ? end : NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ BUG_ON(!len);
+ end = start + len;
+ return end > start ? end - 1 : NFS4_MAX_UINT64;
+}
+
+static struct objlayout_io_state *
+objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
+ struct page **pages,
+ unsigned pgbase,
+ unsigned nr_pages,
+ loff_t offset,
+ size_t count,
+ struct pnfs_layout_segment *lseg,
+ void *rpcdata)
+{
+ struct objlayout_segment *objlseg =
+ container_of(lseg, struct objlayout_segment, lseg);
+ struct objlayout_io_state *state;
+ u64 lseg_end_offset;
+ size_t size_nr_pages;
+
+ dprintk("%s: allocating io_state\n", __func__);
+ if (objio_alloc_io_state(objlseg->internal, &state))
+ return NULL;
+
+ BUG_ON(offset < lseg->pls_range.offset);
+ lseg_end_offset = end_offset(lseg->pls_range.offset, lseg->pls_range.length);
+ BUG_ON(offset >= lseg_end_offset);
+ if (offset + count > lseg_end_offset) {
+ count = lseg->pls_range.length - (offset - lseg->pls_range.offset);
+ dprintk("%s: truncated count %Zd\n", __func__, count);
+ }
+
+ if (pgbase > PAGE_SIZE) {
+ unsigned n = pgbase >> PAGE_SHIFT;
+
+ pgbase &= ~PAGE_MASK;
+ pages += n;
+ nr_pages -= n;
+ }
+
+ size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ BUG_ON(nr_pages < size_nr_pages);
+ if (nr_pages > size_nr_pages)
+ nr_pages = size_nr_pages;
+
+ INIT_LIST_HEAD(&state->err_list);
+ state->objlseg = objlseg;
+ state->rpcdata = rpcdata;
+ state->pages = pages;
+ state->pgbase = pgbase;
+ state->nr_pages = nr_pages;
+ state->offset = offset;
+ state->count = count;
+ state->sync = 0;
+
+ return state;
+}
+
+static void
+objlayout_free_io_state(struct objlayout_io_state *state)
+{
+ dprintk("%s: freeing io_state\n", __func__);
+ if (unlikely(!state))
+ return;
+
+ objio_free_io_state(state);
+}
+
+/*
+ * I/O done common code
+ */
+static void
+objlayout_iodone(struct objlayout_io_state *state)
+{
+ dprintk("%s: state %p status\n", __func__, state);
+
+ if (likely(state->status >= 0)) {
+ objlayout_free_io_state(state);
+ } else {
+ struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.pls_layout);
+
+ spin_lock(&objlay->lock);
+ objlay->delta_space_valid = OBJ_DSU_INVALID;
+ list_add(&objlay->err_list, &state->err_list);
+ spin_unlock(&objlay->lock);
+ }
+}
+
+/*
+ * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
+ *
+ * The @index component IO failed (error returned from target). Register
+ * the error for later reporting at layout-return.
+ */
+void
+objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
+ int osd_error, u64 offset, u64 length, bool is_write)
+{
+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
+
+ BUG_ON(index >= state->num_comps);
+ if (osd_error) {
+ struct pnfs_osd_layout *layout =
+ (typeof(layout))state->objlseg->pnfs_osd_layout;
+
+ ioerr->oer_component = layout->olo_comps[index].oc_object_id;
+ ioerr->oer_comp_offset = offset;
+ ioerr->oer_comp_length = length;
+ ioerr->oer_iswrite = is_write;
+ ioerr->oer_errno = osd_error;
+
+ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
+ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
+ __func__, index, ioerr->oer_errno,
+ ioerr->oer_iswrite,
+ _DEVID_LO(&ioerr->oer_component.oid_device_id),
+ _DEVID_HI(&ioerr->oer_component.oid_device_id),
+ ioerr->oer_component.oid_partition_id,
+ ioerr->oer_component.oid_object_id,
+ ioerr->oer_comp_offset,
+ ioerr->oer_comp_length);
+ } else {
+ /* User need not call if no error is reported */
+ ioerr->oer_errno = 0;
+ }
+}
+
+static void _rpc_commit_complete(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_write_data *wdata;
+
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ wdata = container_of(task, struct nfs_write_data, task);
+
+ pnfs_commit_done(wdata);
+}
+
+/*
+ * Commit data remotely on OSDs
+ */
+enum pnfs_try_status
+objlayout_commit(struct nfs_write_data *wdata, int how)
+{
+ int status = PNFS_ATTEMPTED;
+
+ INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete);
+ schedule_work(&wdata->task.u.tk_work);
+ dprintk("%s: Return %d\n", __func__, status);
+ return status;
+}
+
+/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
+ * This is because the osd completion is called with ints-off from
+ * the block layer
+ */
+static void _rpc_read_complete(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_read_data *rdata;
+
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ rdata = container_of(task, struct nfs_read_data, task);
+
+ pnfs_read_done(rdata);
+}
+
+void
+objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
+{
+ int eof = state->eof;
+ struct nfs_read_data *rdata;
+
+ state->status = status;
+ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
+ rdata = state->rpcdata;
+ rdata->task.tk_status = status;
+ if (status >= 0) {
+ rdata->res.count = status;
+ rdata->res.eof = eof;
+ }
+ objlayout_iodone(state);
+ /* must not use state after this point */
+
+ if (sync)
+ pnfs_read_done(rdata);
+ else {
+ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
+ schedule_work(&rdata->task.u.tk_work);
+ }
+}
+
+/*
+ * Perform sync or async reads.
+ */
+enum pnfs_try_status
+objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages)
+{
+ loff_t offset = rdata->args.offset;
+ size_t count = rdata->args.count;
+ struct objlayout_io_state *state;
+ ssize_t status = 0;
+ loff_t eof;
+
+ dprintk("%s: Begin inode %p offset %llu count %d\n",
+ __func__, rdata->inode, offset, (int)count);
+
+ eof = i_size_read(rdata->inode);
+ if (unlikely(offset + count > eof)) {
+ if (offset >= eof) {
+ status = 0;
+ rdata->res.count = 0;
+ rdata->res.eof = 1;
+ goto out;
+ }
+ count = eof - offset;
+ }
+
+ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
+ rdata->args.pages, rdata->args.pgbase,
+ nr_pages, offset, count,
+ rdata->pdata.lseg, rdata);
+ if (unlikely(!state)) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ state->eof = state->offset + state->count >= eof;
+
+ status = objio_read_pagelist(state);
+ out:
+ dprintk("%s: Return status %Zd\n", __func__, status);
+ rdata->pdata.pnfs_error = status;
+ return PNFS_ATTEMPTED;
+}
+
+/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
+ * This is because the osd completion is called with ints-off from
+ * the block layer
+ */
+static void _rpc_write_complete(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_write_data *wdata;
+
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ wdata = container_of(task, struct nfs_write_data, task);
+
+ pnfs_writeback_done(wdata);
+}
+
+void
+objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
+ bool sync)
+{
+ struct nfs_write_data *wdata;
+
+ dprintk("%s: Begin\n", __func__);
+ wdata = state->rpcdata;
+ state->status = status;
+ wdata->task.tk_status = status;
+ if (status >= 0) {
+ wdata->res.count = status;
+ wdata->verf.committed = state->committed;
+ dprintk("%s: Return status %d committed %d\n",
+ __func__, wdata->task.tk_status,
+ wdata->verf.committed);
+ } else
+ dprintk("%s: Return status %d\n",
+ __func__, wdata->task.tk_status);
+ objlayout_iodone(state);
+ /* must not use state after this point */
+
+ if (sync)
+ pnfs_writeback_done(wdata);
+ else {
+ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
+ schedule_work(&wdata->task.u.tk_work);
+ }
+}
+
+/*
+ * Perform sync or async writes.
+ */
+enum pnfs_try_status
+objlayout_write_pagelist(struct nfs_write_data *wdata,
+ unsigned nr_pages,
+ int how)
+{
+ struct objlayout_io_state *state;
+ ssize_t status;
+
+ dprintk("%s: Begin inode %p offset %llu count %u\n",
+ __func__, wdata->inode, wdata->args.offset, wdata->args.count);
+
+ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
+ wdata->args.pages,
+ wdata->args.pgbase,
+ nr_pages,
+ wdata->args.offset,
+ wdata->args.count,
+ wdata->pdata.lseg, wdata);
+ if (unlikely(!state)) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ state->sync = how & FLUSH_SYNC;
+
+ status = objio_write_pagelist(state, how & FLUSH_STABLE);
+ out:
+ dprintk("%s: Return status %Zd\n", __func__, status);
+ wdata->pdata.pnfs_error = status;
+ return PNFS_ATTEMPTED;
+}
+
+void
+objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *args)
+{
+ struct objlayout *objlay = OBJLAYOUT(pnfslay);
+ struct pnfs_osd_layoutupdate lou;
+ __be32 *start;
+
+ dprintk("%s: Begin\n", __func__);
+
+ spin_lock(&objlay->lock);
+ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
+ lou.dsu_delta = objlay->delta_space_used;
+ objlay->delta_space_used = 0;
+ objlay->delta_space_valid = OBJ_DSU_INIT;
+ lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
+ spin_unlock(&objlay->lock);
+
+ start = xdr_reserve_space(xdr, 4);
+
+ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
+
+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+
+ dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
+ lou.dsu_delta, lou.olu_ioerr_flag);
+}
+
+static int
+err_prio(u32 oer_errno)
+{
+ switch (oer_errno) {
+ case 0:
+ return 0;
+
+ case PNFS_OSD_ERR_RESOURCE:
+ return OSD_ERR_PRI_RESOURCE;
+ case PNFS_OSD_ERR_BAD_CRED:
+ return OSD_ERR_PRI_BAD_CRED;
+ case PNFS_OSD_ERR_NO_ACCESS:
+ return OSD_ERR_PRI_NO_ACCESS;
+ case PNFS_OSD_ERR_UNREACHABLE:
+ return OSD_ERR_PRI_UNREACHABLE;
+ case PNFS_OSD_ERR_NOT_FOUND:
+ return OSD_ERR_PRI_NOT_FOUND;
+ case PNFS_OSD_ERR_NO_SPACE:
+ return OSD_ERR_PRI_NO_SPACE;
+ default:
+ WARN_ON(1);
+ /* fallthrough */
+ case PNFS_OSD_ERR_EIO:
+ return OSD_ERR_PRI_EIO;
+ }
+}
+
+static void
+merge_ioerr(struct pnfs_osd_ioerr *dest_err,
+ const struct pnfs_osd_ioerr *src_err)
+{
+ u64 dest_end, src_end;
+
+ if (!dest_err->oer_errno) {
+ *dest_err = *src_err;
+ /* accumulated device must be blank */
+ memset(&dest_err->oer_component.oid_device_id, 0,
+ sizeof(dest_err->oer_component.oid_device_id));
+
+ return;
+ }
+
+ if (dest_err->oer_component.oid_partition_id !=
+ src_err->oer_component.oid_partition_id)
+ dest_err->oer_component.oid_partition_id = 0;
+
+ if (dest_err->oer_component.oid_object_id !=
+ src_err->oer_component.oid_object_id)
+ dest_err->oer_component.oid_object_id = 0;
+
+ if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
+ dest_err->oer_comp_offset = src_err->oer_comp_offset;
+
+ dest_end = end_offset(dest_err->oer_comp_offset,
+ dest_err->oer_comp_length);
+ src_end = end_offset(src_err->oer_comp_offset,
+ src_err->oer_comp_length);
+ if (dest_end < src_end)
+ dest_end = src_end;
+
+ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
+
+ if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
+ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
+ dest_err->oer_errno = src_err->oer_errno;
+ } else if (src_err->oer_iswrite) {
+ dest_err->oer_iswrite = true;
+ dest_err->oer_errno = src_err->oer_errno;
+ }
+}
+
+static void
+encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr)
+{
+ struct objlayout_io_state *state, *tmp;
+ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
+
+ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+ unsigned i;
+
+ for (i = 0; i < state->num_comps; i++) {
+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+
+ if (!ioerr->oer_errno)
+ continue;
+
+ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
+ "dev(%llx:%llx) par=0x%llx obj=0x%llx "
+ "offset=0x%llx length=0x%llx\n",
+ __func__, i, ioerr->oer_errno,
+ ioerr->oer_iswrite,
+ _DEVID_LO(&ioerr->oer_component.oid_device_id),
+ _DEVID_HI(&ioerr->oer_component.oid_device_id),
+ ioerr->oer_component.oid_partition_id,
+ ioerr->oer_component.oid_object_id,
+ ioerr->oer_comp_offset,
+ ioerr->oer_comp_length);
+
+ merge_ioerr(&accumulated_err, ioerr);
+ }
+ list_del(&state->err_list);
+ objlayout_free_io_state(state);
+ }
+
+ BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err));
+}
+
+void
+objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args)
+{
+ struct objlayout *objlay = OBJLAYOUT(pnfslay);
+ struct objlayout_io_state *state, *tmp;
+ __be32 *start, *uninitialized_var(last_xdr);
+
+ dprintk("%s: Begin\n", __func__);
+ start = xdr_reserve_space(xdr, 4);
+ BUG_ON(!start);
+
+ spin_lock(&objlay->lock);
+
+ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+ unsigned i;
+ int res = 0;
+
+ for (i = 0; i < state->num_comps && !res; i++) {
+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+
+ if (!ioerr->oer_errno)
+ continue;
+
+ dprintk("%s: err[%d]: errno=%d is_write=%d "
+ "dev(%llx:%llx) par=0x%llx obj=0x%llx "
+ "offset=0x%llx length=0x%llx\n",
+ __func__, i, ioerr->oer_errno,
+ ioerr->oer_iswrite,
+ _DEVID_LO(&ioerr->oer_component.oid_device_id),
+ _DEVID_HI(&ioerr->oer_component.oid_device_id),
+ ioerr->oer_component.oid_partition_id,
+ ioerr->oer_component.oid_object_id,
+ ioerr->oer_comp_offset,
+ ioerr->oer_comp_length);
+
+ last_xdr = xdr->p;
+ res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]);
+ }
+ if (unlikely(res)) {
+ /* no space for even one error descriptor */
+ BUG_ON(last_xdr == start + 1);
+
+ /* we've encountered a situation with lots and lots of
+ * errors and no space to encode them all. Use the last
+ * available slot to report the union of all the
+ * remaining errors.
+ */
+ xdr_rewind_stream(xdr, last_xdr -
+ pnfs_osd_ioerr_xdr_sz() / 4);
+ encode_accumulated_error(objlay, xdr);
+ goto loop_done;
+ }
+ list_del(&state->err_list);
+ objlayout_free_io_state(state);
+ }
+loop_done:
+ spin_unlock(&objlay->lock);
+
+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+ dprintk("%s: Return\n", __func__);
+}
+
+struct objlayout_deviceinfo {
+ struct page *page;
+ struct pnfs_osd_deviceaddr da; /* This must be last */
+};
+
+/* Initialize and call nfs_getdeviceinfo, then decode and return a
+ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
+ * should be called.
+ */
+int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr)
+{
+ struct objlayout_deviceinfo *odi;
+ struct pnfs_device pd;
+ struct super_block *sb;
+ struct page *page;
+ size_t sz;
+ u32 *p;
+ int err;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ pd.area = page_address(page);
+
+ memcpy(&pd.dev_id, d_id, sizeof(*d_id));
+ pd.layout_type = LAYOUT_OSD2_OBJECTS;
+ pd.pages = &page;
+ pd.pgbase = 0;
+ pd.pglen = PAGE_SIZE;
+ pd.mincount = 0;
+
+ sb = pnfslay->plh_inode->i_sb;
+ err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
+ if (err)
+ goto err_out;
+
+ p = pd.area;
+ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p);
+ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL);
+ if (!odi) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
+ odi->page = page;
+ *deviceaddr = &odi->da;
+ return 0;
+
+err_out:
+ __free_page(page);
+ return err;
+}
+
+void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
+{
+ struct objlayout_deviceinfo *odi = container_of(deviceaddr,
+ struct objlayout_deviceinfo,
+ da);
+
+ __free_page(odi->page);
+ kfree(odi);
+}
+
+/*
+ * Perform the objio specific init_mt method.
+ * Set the layout driver private data pointer for later use.
+ */
+int
+objlayout_set_layoutdriver(struct nfs_server *server,
+ const struct nfs_fh *mntfh)
+{
+ void *data;
+
+ data = objio_init_mt();
+ if (IS_ERR(data)) {
+ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n",
+ __func__, PTR_ERR(data));
+ return PTR_ERR(data);
+ }
+ server->pnfs_ld_data = data;
+
+ dprintk("%s: Return data=%p\n", __func__, data);
+ return 0;
+}
+
+/*
+ * Perform the objio specific fini_mt method to release the
+ * layoutdriver private data.
+ */
+int
+objlayout_clear_layoutdriver(struct nfs_server *server)
+{
+ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data);
+ objio_fini_mt(server->pnfs_ld_data);
+ return 0;
+}
diff -up linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.h
--- linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.h.orig 2011-03-26 07:57:44.265821386 -0400
+++ linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.h 2011-03-26 07:57:44.265821386 -0400
@@ -0,0 +1,206 @@
+/*
+ * objlayout.h
+ *
+ * Data types and function declarations for interfacing with the
+ * pNFS standard object layout driver.
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _OBJLAYOUT_H
+#define _OBJLAYOUT_H
+
+#include <linux/nfs_fs.h>
+#include <linux/pnfs_osd_xdr.h>
+#include "../pnfs.h"
+
+/*
+ * in-core layout segment
+ */
+struct objlayout_segment {
+ struct pnfs_layout_segment lseg;
+ void *internal; /* for provider internal use */
+ u8 pnfs_osd_layout[];
+};
+
+/*
+ * per-inode layout
+ */
+struct objlayout {
+ struct pnfs_layout_hdr pnfs_layout;
+
+ /* for layout_commit */
+ enum osd_delta_space_valid_enum {
+ OBJ_DSU_INIT = 0,
+ OBJ_DSU_VALID,
+ OBJ_DSU_INVALID,
+ } delta_space_valid;
+ s64 delta_space_used; /* consumed by write ops */
+
+ /* for layout_return */
+ spinlock_t lock;
+ struct list_head err_list;
+};
+
+static inline struct objlayout *
+OBJLAYOUT(struct pnfs_layout_hdr *lo)
+{
+ return container_of(lo, struct objlayout, pnfs_layout);
+}
+
+/*
+ * per-I/O operation state
+ * embedded in objects provider io_state data structure
+ */
+struct objlayout_io_state {
+ struct objlayout_segment *objlseg;
+
+ struct page **pages;
+ unsigned pgbase;
+ unsigned nr_pages;
+ unsigned long count;
+ loff_t offset;
+ bool sync;
+
+ void *rpcdata;
+ int status; /* res */
+ int eof; /* res */
+ int committed; /* res */
+
+ /* Error reporting (layout_return) */
+ struct list_head err_list;
+ unsigned num_comps;
+ /* Pointer to array of error descriptors of size num_comps.
+ * It should contain as many entries as devices in the osd_layout
+ * that participate in the I/O. It is up to the io_engine to allocate
+ * needed space and set num_comps.
+ */
+ struct pnfs_osd_ioerr *ioerrs;
+};
+
+/*
+ * Raid engine I/O API
+ */
+extern void *objio_init_mt(void);
+extern void objio_fini_mt(void *mt);
+
+extern int objio_alloc_lseg(void **outp,
+ struct pnfs_layout_hdr *pnfslay,
+ struct pnfs_layout_segment *lseg,
+ struct pnfs_osd_layout *layout);
+extern void objio_free_lseg(void *p);
+
+extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp);
+extern void objio_free_io_state(struct objlayout_io_state *state);
+
+extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
+extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
+ bool stable);
+
+/*
+ * callback API
+ */
+extern void objlayout_io_set_result(struct objlayout_io_state *state,
+ unsigned index, int osd_error,
+ u64 offset, u64 length, bool is_write);
+
+static inline void
+objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
+{
+ struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.pls_layout);
+
+ /* If one of the I/Os errored out and the delta_space_used was
+ * invalid we render the complete report as invalid. Protocol mandate
+ * the DSU be accurate or not reported.
+ */
+ spin_lock(&objlay->lock);
+ if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
+ objlay->delta_space_valid = OBJ_DSU_VALID;
+ objlay->delta_space_used += space_used;
+ }
+ spin_unlock(&objlay->lock);
+}
+
+extern void objlayout_read_done(struct objlayout_io_state *state,
+ ssize_t status, bool sync);
+extern void objlayout_write_done(struct objlayout_io_state *state,
+ ssize_t status, bool sync);
+
+extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr);
+extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
+
+/*
+ * exported generic objects function vectors
+ */
+
+extern int objlayout_set_layoutdriver(
+ struct nfs_server *,
+ const struct nfs_fh *);
+extern int objlayout_clear_layoutdriver(struct nfs_server *);
+
+extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *);
+extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
+
+extern struct pnfs_layout_segment *objlayout_alloc_lseg(
+ struct pnfs_layout_hdr *,
+ struct nfs4_layoutget_res *);
+extern void objlayout_free_lseg(struct pnfs_layout_segment *);
+
+extern enum pnfs_try_status objlayout_read_pagelist(
+ struct nfs_read_data *,
+ unsigned nr_pages);
+
+extern enum pnfs_try_status objlayout_write_pagelist(
+ struct nfs_write_data *,
+ unsigned nr_pages,
+ int how);
+
+extern enum pnfs_try_status objlayout_commit(
+ struct nfs_write_data *,
+ int how);
+
+extern void objlayout_encode_layoutcommit(
+ struct pnfs_layout_hdr *,
+ struct xdr_stream *,
+ const struct nfs4_layoutcommit_args *);
+
+extern void objlayout_encode_layoutreturn(
+ struct pnfs_layout_hdr *,
+ struct xdr_stream *,
+ const struct nfs4_layoutreturn_args *);
+
+#endif /* _OBJLAYOUT_H */
diff -up linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.c
--- linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2011-03-26 07:57:44.266821378 -0400
+++ linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.c 2011-03-26 07:57:44.266821378 -0400
@@ -0,0 +1,702 @@
+/*
+ * panfs_shim.c
+ *
+ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * See the file COPYING included with this distribution for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+
+#include "objlayout.h"
+#include "panfs_shim.h"
+
+#include <linux/panfs_shim_api.h>
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+struct panfs_export_operations *panfs_export_ops;
+
+void *
+objio_init_mt(void)
+{
+ return panfs_export_ops == NULL ? ERR_PTR(-EAGAIN) : NULL;
+}
+
+void objio_fini_mt(void *mountid)
+{
+}
+
+static int
+panfs_shim_conv_raid01(struct pnfs_osd_layout *layout,
+ struct pnfs_osd_data_map *lo_map,
+ pan_agg_layout_hdr_t *hdr)
+{
+ if (lo_map->odm_mirror_cnt) {
+ hdr->type = PAN_AGG_RAID1;
+ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1;
+ } else if (layout->olo_num_comps > 1) {
+ hdr->type = PAN_AGG_RAID0;
+ hdr->hdr.raid0.num_comps = layout->olo_num_comps;
+ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit;
+ } else
+ hdr->type = PAN_AGG_SIMPLE;
+ return 0;
+}
+
+static int
+panfs_shim_conv_raid5(struct pnfs_osd_layout *layout,
+ struct pnfs_osd_data_map *lo_map,
+ pan_agg_layout_hdr_t *hdr)
+{
+ if (lo_map->odm_mirror_cnt)
+ goto err;
+
+ if (lo_map->odm_group_width || lo_map->odm_group_depth) {
+ if (!lo_map->odm_group_width || !lo_map->odm_group_depth)
+ goto err;
+
+ hdr->type = PAN_AGG_GRP_RAID5_LEFT;
+ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps;
+ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps)
+ goto err;
+ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit;
+ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width;
+ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth;
+		/* this is a guess; the Panasas server is not supposed to
+		   hand out a layout otherwise */
+ hdr->hdr.grp_raid5_left.group_layout_policy =
+ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN;
+ } else {
+ hdr->type = PAN_AGG_RAID5_LEFT;
+ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps;
+ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps)
+ goto err;
+ hdr->hdr.raid5_left.stripe_unit2 =
+ hdr->hdr.raid5_left.stripe_unit1 =
+ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit;
+ }
+
+ return 0;
+err:
+ return -EINVAL;
+}
+
+/*
+ * Convert a pnfs_osd data map into Panasas aggregation layout header
+ */
+static int
+panfs_shim_conv_pnfs_osd_data_map(
+ struct pnfs_osd_layout *layout,
+ pan_agg_layout_hdr_t *hdr)
+{
+ int status = -EINVAL;
+ struct pnfs_osd_data_map *lo_map = &layout->olo_map;
+
+ if (!layout->olo_num_comps) {
+ dprintk("%s: !!layout.n_comps(%u)\n", __func__,
+ layout->olo_num_comps);
+ goto err;
+ }
+
+ switch (lo_map->odm_raid_algorithm) {
+ case PNFS_OSD_RAID_0:
+ if (layout->olo_num_comps != lo_map->odm_num_comps ||
+ layout->olo_comps_index) {
+ dprintk("%s: !!PNFS_OSD_RAID_0 "
+ "layout.n_comps(%u) map.n_comps(%u) "
+ "comps_index(%u)\n", __func__,
+ layout->olo_num_comps,
+ lo_map->odm_num_comps,
+ layout->olo_comps_index);
+ goto err;
+ }
+ status = panfs_shim_conv_raid01(layout, lo_map, hdr);
+ break;
+
+ case PNFS_OSD_RAID_5:
+ if (!lo_map->odm_group_width) {
+ if (layout->olo_num_comps != lo_map->odm_num_comps ||
+ layout->olo_comps_index) {
+ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width "
+ "layout.n_comps(%u)!=map.n_comps(%u) "
+ "|| comps_index(%u)\n", __func__,
+ layout->olo_num_comps,
+ lo_map->odm_num_comps,
+ layout->olo_comps_index);
+ goto err;
+ }
+ } else if ((layout->olo_num_comps != lo_map->odm_num_comps &&
+ layout->olo_num_comps > lo_map->odm_group_width) ||
+ (layout->olo_comps_index % lo_map->odm_group_width)){
+ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) "
+ "layout.n_comps(%u) map.n_comps(%u) "
+ "comps_index(%u)\n", __func__,
+ lo_map->odm_group_width,
+ layout->olo_num_comps,
+ lo_map->odm_num_comps,
+ layout->olo_comps_index);
+ goto err;
+ }
+ status = panfs_shim_conv_raid5(layout, lo_map, hdr);
+ break;
+
+ case PNFS_OSD_RAID_4:
+ case PNFS_OSD_RAID_PQ:
+ default:
+ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__,
+ lo_map->odm_raid_algorithm);
+ goto err;
+ }
+
+ return 0;
+
+err:
+ return status;
+}
+
+/*
+ * Convert pnfs_osd layout into Panasas map and caps type
+ */
+int
+objio_alloc_lseg(void **outp,
+ struct pnfs_layout_hdr *pnfslay,
+ struct pnfs_layout_segment *lseg,
+ struct pnfs_osd_layout *layout)
+{
+ int i, total_comps;
+ int status;
+ struct pnfs_osd_object_cred *lo_comp;
+ pan_size_t alloc_sz, local_sz;
+ pan_sm_map_cap_t *mcs = NULL;
+ u8 *buf;
+ pan_agg_comp_obj_t *pan_comp;
+ pan_sm_sec_t *pan_sec;
+
+ status = -EINVAL;
+ if (layout->olo_num_comps < layout->olo_map.odm_group_width) {
+ total_comps = layout->olo_comps_index + layout->olo_num_comps;
+ } else {
+ /* allocate full map, otherwise SAM gets confused */
+ total_comps = layout->olo_map.odm_num_comps;
+ }
+ alloc_sz = total_comps *
+ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t));
+ for (i = 0; i < layout->olo_num_comps; i++) {
+ void *p = layout->olo_comps[i].oc_cap.cred;
+ if (panfs_export_ops->sm_sec_t_get_size_otw(
+ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL))
+ goto err;
+ alloc_sz += local_sz;
+ }
+
+ status = -ENOMEM;
+ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL);
+ if (!mcs)
+ goto err;
+ buf = (u8 *)&mcs[1];
+
+ mcs->offset = lseg->pls_range.offset;
+ mcs->length = lseg->pls_range.length;
+#if 0
+ /* FIXME: for now */
+ mcs->expiration_time.ts_sec = 0;
+ mcs->expiration_time.ts_nsec = 0;
+#endif
+ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL;
+ status = panfs_shim_conv_pnfs_osd_data_map(layout,
+ &mcs->full_map.layout_hdr);
+ if (status)
+ goto err;
+
+ mcs->full_map.components.size = total_comps;
+ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf;
+ buf += total_comps * sizeof(pan_agg_comp_obj_t);
+
+ mcs->secs.size = total_comps;
+ mcs->secs.data = (pan_sm_sec_t *)buf;
+ buf += total_comps * sizeof(pan_sm_sec_t);
+
+ lo_comp = layout->olo_comps;
+ pan_comp = mcs->full_map.components.data + layout->olo_comps_index;
+ pan_sec = mcs->secs.data + layout->olo_comps_index;
+ for (i = 0; i < layout->olo_num_comps; i++) {
+ void *p;
+ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id;
+ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id;
+ u64 dev_id = __be64_to_cpup(
+ (__be64 *)oc_obj_id->oid_device_id.data + 1);
+
+ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n",
+ __func__, i,
+ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data),
+ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1),
+ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id);
+
+ if (i == 0) {
+ /* make up mgr_id to calm sam down */
+ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0,
+ &obj_id->dev_id);
+ obj_id->grp_id = oc_obj_id->oid_partition_id;
+ obj_id->obj_id = oc_obj_id->oid_object_id;
+ }
+
+ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) {
+ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n",
+ __func__, i, (u64)obj_id->grp_id,
+ lo_comp->oc_object_id.oid_partition_id);
+ status = -EINVAL;
+ goto err;
+ }
+
+ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) {
+ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n",
+ __func__, i, obj_id->obj_id,
+ lo_comp->oc_object_id.oid_object_id);
+ status = -EINVAL;
+ goto err;
+ }
+
+ pan_comp->dev_id = dev_id;
+ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) {
+ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n",
+ __func__, i, obj_id->dev_id);
+ status = -EINVAL;
+ goto err;
+ }
+ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) {
+ dprintk("%s: degraded maps not supported yet\n",
+ __func__);
+ status = -ENOTSUPP;
+ goto err;
+ }
+ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL;
+ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) {
+ dprintk("%s: cap key security not supported yet\n",
+ __func__);
+ status = -ENOTSUPP;
+ goto err;
+ }
+
+ p = lo_comp->oc_cap.cred;
+ panfs_export_ops->sm_sec_t_unmarshall(
+ (pan_sm_sec_otw_t *)&p,
+ pan_sec,
+ buf,
+ alloc_sz,
+ NULL,
+ &local_sz);
+ buf += local_sz;
+ alloc_sz -= local_sz;
+
+ lo_comp++;
+ pan_comp++;
+ pan_sec++;
+ }
+
+ *outp = mcs;
+ dprintk("%s:Return mcs=%p\n", __func__, mcs);
+ return 0;
+
+err:
+ objio_free_lseg(mcs);
+ dprintk("%s:Error %d\n", __func__, status);
+ return status;
+}
+
+/*
+ * Free a Panasas map and caps type
+ */
+void
+objio_free_lseg(void *p)
+{
+ kfree(p);
+}
+
+/*
+ * I/O routines
+ */
+int
+objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
+{
+ struct panfs_shim_io_state *p;
+
+ dprintk("%s: allocating io_state\n", __func__);
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ *outp = &p->ol_state;
+ return 0;
+}
+
+/*
+ * Free an I/O state
+ */
+void
+objio_free_io_state(struct objlayout_io_state *ol_state)
+{
+ struct panfs_shim_io_state *state = container_of(ol_state,
+ struct panfs_shim_io_state, ol_state);
+ int i;
+
+ dprintk("%s: freeing io_state\n", __func__);
+ for (i = 0; i < state->ol_state.nr_pages; i++)
+ kunmap(state->ol_state.pages[i]);
+
+ if (state->ucreds)
+ panfs_export_ops->ucreds_put(state->ucreds);
+ kfree(state->sg_list);
+ kfree(state);
+}
+
+static int
+panfs_shim_pages_to_sg(
+ struct panfs_shim_io_state *state,
+ struct page **pages,
+ unsigned int pgbase,
+ unsigned nr_pages,
+ size_t count)
+{
+ unsigned i, n;
+ pan_sg_entry_t *sg;
+
+ dprintk("%s pgbase %u nr_pages %u count %d "
+ "pg0 %p flags 0x%x index %llu\n",
+ __func__, pgbase, nr_pages, (int)count, pages[0],
+ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index);
+
+ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL);
+ if (sg == NULL)
+ return -ENOMEM;
+
+ dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n",
+ __func__, sg, pages, pgbase, nr_pages);
+
+ for (i = 0; i < nr_pages; i++) {
+ sg[i].buffer = (char *)kmap(pages[i]) + pgbase;
+ n = PAGE_SIZE - pgbase;
+ pgbase = 0;
+ if (n > count)
+ n = count;
+ sg[i].chunk_size = n;
+ count -= n;
+ if (likely(count)) {
+ sg[i].next = &sg[i+1];
+ } else {
+ /* we're done */
+ sg[i].next = NULL;
+ break;
+ }
+ }
+ BUG_ON(count);
+
+ state->sg_list = sg;
+ return 0;
+}
+
+/*
+ * Callback function for async reads
+ */
+static void
+panfs_shim_read_done(
+ void *arg1,
+ void *arg2,
+ pan_sam_read_res_t *res_p,
+ pan_status_t rc)
+{
+ struct panfs_shim_io_state *state = arg1;
+ ssize_t status;
+
+ dprintk("%s: Begin\n", __func__);
+ if (!res_p)
+ res_p = &state->u.read.res;
+ if (rc == PAN_SUCCESS)
+ rc = res_p->result;
+ if (rc == PAN_SUCCESS) {
+ status = res_p->length;
+ WARN_ON(status < 0);
+ } else {
+ status = -panfs_export_ops->convert_rc(rc);
+ dprintk("%s: pan_sam_read rc %d: status %Zd\n",
+ __func__, rc, status);
+ }
+ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc);
+ objlayout_read_done(&state->ol_state, status, true);
+}
+
+ssize_t
+objio_read_pagelist(struct objlayout_io_state *ol_state)
+{
+ struct panfs_shim_io_state *state = container_of(ol_state,
+ struct panfs_shim_io_state, ol_state);
+ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal;
+ ssize_t status = 0;
+ pan_status_t rc = PAN_SUCCESS;
+
+ dprintk("%s: Begin\n", __func__);
+
+ status = panfs_shim_pages_to_sg(state, ol_state->pages,
+ ol_state->pgbase, ol_state->nr_pages,
+ ol_state->count);
+ if (unlikely(status))
+ goto err;
+
+ state->obj_sec.min_security = 0;
+ state->obj_sec.map_ccaps = mcs;
+
+ rc = panfs_export_ops->ucreds_get(&state->ucreds);
+ if (unlikely(rc)) {
+ status = -EACCES;
+ goto err;
+ }
+
+ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id;
+ state->u.read.args.offset = ol_state->offset;
+ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP,
+ &state->u.read.args,
+ &state->obj_sec,
+ state->sg_list,
+ state->ucreds,
+ ol_state->sync ?
+ NULL : panfs_shim_read_done,
+ state, NULL,
+ &state->u.read.res);
+ if (rc != PAN_ERR_IN_PROGRESS)
+ panfs_shim_read_done(state, NULL, &state->u.read.res, rc);
+ err:
+ dprintk("%s: Return %Zd\n", __func__, status);
+ return status;
+}
+
+/*
+ * Callback function for async writes
+ */
+static void
+panfs_shim_write_done(
+ void *arg1,
+ void *arg2,
+ pan_sam_write_res_t *res_p,
+ pan_status_t rc)
+{
+ struct panfs_shim_io_state *state = arg1;
+ ssize_t status;
+
+ dprintk("%s: Begin\n", __func__);
+ if (!res_p)
+ res_p = &state->u.write.res;
+ if (rc == PAN_SUCCESS)
+ rc = res_p->result;
+ if (rc == PAN_SUCCESS) {
+/* state->ol_state.committed = NFS_FILE_SYNC;*/
+ state->ol_state.committed = NFS_UNSTABLE;
+ status = res_p->length;
+ WARN_ON(status < 0);
+
+ objlayout_add_delta_space_used(&state->ol_state,
+ res_p->delta_capacity_used);
+ } else {
+ status = -panfs_export_ops->convert_rc(rc);
+ dprintk("%s: pan_sam_write rc %u: status %Zd\n",
+ __func__, rc, status);
+ }
+ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc);
+ objlayout_write_done(&state->ol_state, status, true);
+}
+
+ssize_t
+objio_write_pagelist(struct objlayout_io_state *ol_state,
+ bool stable /* unused, PanOSD writes are stable */)
+{
+ struct panfs_shim_io_state *state = container_of(ol_state,
+ struct panfs_shim_io_state, ol_state);
+ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal;
+ ssize_t status = 0;
+ pan_status_t rc = PAN_SUCCESS;
+
+ dprintk("%s: Begin\n", __func__);
+
+ status = panfs_shim_pages_to_sg(state, ol_state->pages,
+ ol_state->pgbase, ol_state->nr_pages,
+ ol_state->count);
+ if (unlikely(status))
+ goto err;
+
+ state->obj_sec.min_security = 0;
+ state->obj_sec.map_ccaps = mcs;
+
+ rc = panfs_export_ops->ucreds_get(&state->ucreds);
+ if (unlikely(rc)) {
+ status = -EACCES;
+ goto err;
+ }
+
+ state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id;
+ state->u.write.args.offset = ol_state->offset;
+ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE,
+ &state->u.write.args,
+ &state->obj_sec,
+ state->sg_list,
+ state->ucreds,
+ ol_state->sync ?
+ NULL : panfs_shim_write_done,
+ state,
+ NULL,
+ &state->u.write.res);
+ if (rc != PAN_ERR_IN_PROGRESS)
+ panfs_shim_write_done(state, NULL, &state->u.write.res, rc);
+ err:
+ dprintk("%s: Return %Zd\n", __func__, status);
+ return status;
+}
+
+int
+panfs_shim_register(struct panfs_export_operations *ops)
+{
+ if (panfs_export_ops) {
+ printk(KERN_INFO
+ "%s: panfs already registered (panfs ops %p)\n",
+ __func__, panfs_export_ops);
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "%s: registering panfs ops %p\n",
+ __func__, ops);
+
+ panfs_export_ops = ops;
+ return 0;
+}
+EXPORT_SYMBOL(panfs_shim_register);
+
+int
+panfs_shim_unregister(void)
+{
+ if (!panfs_export_ops) {
+ printk(KERN_INFO "%s: panfs is not registered\n", __func__);
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "%s: unregistering panfs ops %p\n",
+ __func__, panfs_export_ops);
+
+ panfs_export_ops = NULL;
+ return 0;
+}
+EXPORT_SYMBOL(panfs_shim_unregister);
+
+/*
+ * Policy Operations
+ */
+
+#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024)
+#define PANLAYOUT_DEF_STRIPE_WIDTH 9
+#define PANLAYOUT_MAX_STRIPE_WIDTH 11
+#define PANLAYOUT_MAX_GATHER_STRIPES 8
+
+/*
+ * Get the max [rw]size
+ */
+static ssize_t
+panlayout_get_blocksize(void)
+{
+ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) *
+ PANLAYOUT_DEF_STRIPE_UNIT *
+ PANLAYOUT_MAX_GATHER_STRIPES;
+ dprintk("%s: Return %Zd\n", __func__, sz);
+ return sz;
+}
+
+/*
+ * Don't gather across stripes, but rather gather (coalesce) up to
+ * the stripe size.
+ *
+ * FIXME: change interface to use merge_align, merge_count
+ */
+#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS)
+
+static struct pnfs_layoutdriver_type panlayout_type = {
+ .id = PNFS_LAYOUT_PANOSD,
+ .name = "PNFS_LAYOUT_PANOSD",
+ .flags = PNFS_LAYOUTRET_ON_SETATTR,
+
+ .set_layoutdriver = objlayout_set_layoutdriver,
+ .clear_layoutdriver = objlayout_clear_layoutdriver,
+
+ .alloc_layout_hdr = objlayout_alloc_layout_hdr,
+ .free_layout_hdr = objlayout_free_layout_hdr,
+
+ .alloc_lseg = objlayout_alloc_lseg,
+ .free_lseg = objlayout_free_lseg,
+
+ .get_blocksize = panlayout_get_blocksize,
+
+ .read_pagelist = objlayout_read_pagelist,
+ .write_pagelist = objlayout_write_pagelist,
+ .commit = objlayout_commit,
+
+ .encode_layoutcommit = objlayout_encode_layoutcommit,
+ .encode_layoutreturn = objlayout_encode_layoutreturn,
+};
+
+MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs");
+MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
+MODULE_LICENSE("GPL");
+
+static int __init
+panlayout_init(void)
+{
+ int ret = pnfs_register_layoutdriver(&panlayout_type);
+
+ if (ret)
+ printk(KERN_INFO
+ "%s: Registering Panasas OSD pNFS Layout Driver failed: error=%d\n",
+ __func__, ret);
+ else
+ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n",
+ __func__);
+ return ret;
+}
+
+static void __exit
+panlayout_exit(void)
+{
+ pnfs_unregister_layoutdriver(&panlayout_type);
+ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n",
+ __func__);
+}
+
+module_init(panlayout_init);
+module_exit(panlayout_exit);
diff -up linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.h
--- linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2011-03-26 07:57:44.267821370 -0400
+++ linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.h 2011-03-26 07:57:44.267821370 -0400
@@ -0,0 +1,482 @@
+/*
+ * panfs_shim.h
+ *
+ * Data types and external function declarations for interfacing with
+ * panfs (Panasas DirectFlow) I/O stack
+ *
+ * Copyright (C) 2007 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * See the file COPYING included with this distribution for more details.
+ *
+ */
+
+#ifndef _PANLAYOUT_PANFS_SHIM_H
+#define _PANLAYOUT_PANFS_SHIM_H
+
+typedef s8 pan_int8_t;
+typedef u8 pan_uint8_t;
+typedef s16 pan_int16_t;
+typedef u16 pan_uint16_t;
+typedef s32 pan_int32_t;
+typedef u32 pan_uint32_t;
+typedef s64 pan_int64_t;
+typedef u64 pan_uint64_t;
+
+/*
+ * from pan_base_types.h
+ */
+typedef pan_uint64_t pan_rpc_none_t;
+typedef pan_uint32_t pan_rpc_arrdim_t;
+typedef pan_uint32_t pan_status_t;
+typedef pan_uint8_t pan_otw_t;
+typedef pan_uint8_t pan_pad_t;
+
+typedef pan_uint32_t pan_timespec_sec_t;
+typedef pan_uint32_t pan_timespec_nsec_t;
+
+typedef struct pan_timespec_s pan_timespec_t;
+struct pan_timespec_s {
+ pan_timespec_sec_t ts_sec;
+ pan_timespec_nsec_t ts_nsec;
+};
+
+/*
+ * from pan_std_types.h
+ */
+typedef pan_uint32_t pan_size_t;
+typedef int pan_bool_t;
+
+/*
+ * from pan_common_error.h
+ */
+#define PAN_SUCCESS ((pan_status_t)0)
+#define PAN_ERR_IN_PROGRESS ((pan_status_t)55)
+
+/*
+ * from pan_sg.h
+ */
+typedef struct pan_sg_entry_s pan_sg_entry_t;
+struct pan_sg_entry_s {
+ void *buffer; /* pointer to memory */
+ pan_uint32_t chunk_size; /* size of each chunk (bytes) */
+ pan_sg_entry_t *next;
+};
+
+/*
+ * from pan_storage.h
+ */
+typedef pan_uint64_t pan_stor_dev_id_t;
+typedef pan_uint32_t pan_stor_obj_grp_id_t;
+typedef pan_uint64_t pan_stor_obj_uniq_t;
+typedef pan_uint32_t pan_stor_action_t;
+typedef pan_uint8_t pan_stor_cap_key_t[20];
+
+typedef pan_uint8_t pan_stor_key_type_t;
+typedef pan_uint64_t pan_stor_len_t;
+typedef pan_int64_t pan_stor_delta_len_t;
+typedef pan_uint64_t pan_stor_offset_t;
+typedef pan_uint16_t pan_stor_op_t;
+
+typedef pan_uint16_t pan_stor_sec_level_t;
+
+struct pan_stor_obj_id_s {
+ pan_stor_dev_id_t dev_id;
+ pan_stor_obj_uniq_t obj_id;
+ pan_stor_obj_grp_id_t grp_id;
+};
+
+typedef struct pan_stor_obj_id_s pan_stor_obj_id_t;
+
+#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U)
+#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U)
+#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U)
+#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U)
+#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U)
+#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U)
+#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U)
+#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U)
+
+/*
+ * from pan_aggregation_map.h
+ */
+typedef pan_uint8_t pan_agg_type_t;
+typedef pan_uint64_t pan_agg_map_version_t;
+typedef pan_uint8_t pan_agg_obj_state_t;
+typedef pan_uint8_t pan_agg_comp_state_t;
+typedef pan_uint8_t pan_agg_comp_flag_t;
+
+#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00)
+#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01)
+#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02)
+#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03)
+#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04)
+#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05)
+#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06)
+#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07)
+#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00)
+#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01)
+#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02)
+#define PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03)
+#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00)
+#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01)
+#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02)
+#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04)
+
+struct pan_aggregation_map_s {
+ pan_agg_map_version_t version;
+ pan_agg_obj_state_t avail_state;
+ pan_stor_obj_id_t obj_id;
+};
+
+typedef struct pan_aggregation_map_s pan_aggregation_map_t;
+
+struct pan_agg_comp_obj_s {
+ pan_stor_dev_id_t dev_id;
+ pan_agg_comp_state_t avail_state;
+ pan_agg_comp_flag_t comp_flags;
+};
+
+typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t;
+
+struct pan_agg_simple_header_s {
+ pan_uint8_t unused;
+};
+
+typedef struct pan_agg_simple_header_s pan_agg_simple_header_t;
+
+struct pan_agg_raid1_header_s {
+ pan_uint16_t num_comps;
+};
+
+typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t;
+
+struct pan_agg_raid0_header_s {
+ pan_uint16_t num_comps;
+ pan_uint32_t stripe_unit;
+};
+
+typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t;
+
+struct pan_agg_raid5_left_header_s {
+ pan_uint16_t num_comps;
+ pan_uint32_t stripe_unit0;
+ pan_uint32_t stripe_unit1;
+ pan_uint32_t stripe_unit2;
+};
+
+typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t;
+
+typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t;
+
+struct pan_agg_grp_raid5_left_header_s {
+ pan_uint16_t num_comps;
+ pan_uint32_t stripe_unit;
+ pan_uint16_t rg_width;
+ pan_uint16_t rg_depth;
+ pan_uint8_t group_layout_policy;
+};
+
+#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00)
+#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01)
+
+#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00)
+#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01)
+#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02)
+#define PAN_AGG_RAID0 ((pan_agg_type_t) 0x03)
+#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04)
+#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06)
+#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01)
+#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06)
+
+struct pan_agg_layout_hdr_s {
+ pan_agg_type_t type;
+ pan_pad_t pad[3];
+ union {
+ pan_uint64_t null;
+ pan_agg_simple_header_t simple;
+ pan_agg_raid1_header_t raid1;
+ pan_agg_raid0_header_t raid0;
+ pan_agg_raid5_left_header_t raid5_left;
+ pan_agg_grp_raid5_left_header_t grp_raid5_left;
+ } hdr;
+};
+
+typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t;
+
+struct pan_agg_comp_obj_a_s {
+ pan_rpc_arrdim_t size;
+ pan_agg_comp_obj_t *data;
+};
+typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a;
+
+struct pan_agg_full_map_s {
+ pan_aggregation_map_t map_hdr;
+ pan_agg_layout_hdr_t layout_hdr;
+ pan_agg_comp_obj_a components;
+};
+
+typedef struct pan_agg_full_map_s pan_agg_full_map_t;
+
+/*
+ * from pan_obsd_rpc_types.h
+ */
+typedef pan_uint8_t pan_obsd_security_key_a[16];
+
+typedef pan_uint8_t pan_obsd_capability_key_a[20];
+
+typedef pan_uint8_t pan_obsd_key_holder_id_t;
+
+#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01)
+#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02)
+
+struct pan_obsd_key_holder_s {
+ pan_obsd_key_holder_id_t select;
+ pan_pad_t pad[3];
+ union {
+ pan_obsd_security_key_a basis_key;
+ pan_obsd_capability_key_a cap_key;
+ } key;
+};
+
+typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t;
+
+/*
+ * from pan_sm_sec.h
+ */
+typedef pan_uint8_t pan_sm_sec_type_t;
+typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t;
+
+struct pan_obsd_capability_generic_otw_t_s {
+ pan_rpc_arrdim_t size;
+ pan_uint8_t *data;
+};
+typedef struct pan_obsd_capability_generic_otw_t_s
+ pan_obsd_capability_generic_otw_t;
+
+struct pan_sm_sec_obsd_s {
+ pan_obsd_key_holder_t key;
+ pan_obsd_capability_generic_otw_t cap_otw;
+ pan_sm_sec_otw_allo_mode_t allo_mode;
+};
+
+typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t;
+
+struct pan_sm_sec_s {
+ pan_sm_sec_type_t type;
+ pan_pad_t pad[3];
+ union {
+ pan_rpc_none_t none;
+ pan_sm_sec_obsd_t obsd;
+ } variant;
+};
+
+typedef struct pan_sm_sec_s pan_sm_sec_t;
+
+struct pan_sm_sec_a_s {
+ pan_rpc_arrdim_t size;
+ pan_sm_sec_t *data;
+};
+typedef struct pan_sm_sec_a_s pan_sm_sec_a;
+typedef pan_otw_t *pan_sm_sec_otw_t;
+
+/*
+ * from pan_sm_types.h
+ */
+typedef pan_uint64_t pan_sm_cap_handle_t;
+
+struct pan_sm_map_cap_s {
+ pan_agg_full_map_t full_map;
+ pan_stor_offset_t offset;
+ pan_stor_len_t length;
+ pan_sm_sec_a secs;
+ pan_sm_cap_handle_t handle;
+ pan_timespec_t expiration_time;
+ pan_stor_action_t action_mask;
+ pan_uint32_t flags;
+};
+
+typedef struct pan_sm_map_cap_s pan_sm_map_cap_t;
+
+/*
+ * from pan_sm_ops.h
+ */
+typedef pan_rpc_none_t pan_sm_cache_ptr_t;
+
+/*
+ * from pan_sam_api.h
+ */
+typedef pan_uint32_t pan_sam_access_flags_t;
+
+typedef struct pan_sam_dev_error_s pan_sam_dev_error_t;
+struct pan_sam_dev_error_s {
+ pan_stor_dev_id_t dev_id;
+ pan_stor_op_t stor_op;
+ pan_status_t error;
+};
+
+typedef struct pan_sam_ext_status_s pan_sam_ext_status_t;
+struct pan_sam_ext_status_s {
+ pan_uint32_t available;
+ pan_uint32_t size;
+ pan_sam_dev_error_t *errors;
+};
+
+enum pan_sam_rpc_sec_sel_e {
+ PAN_SAM_RPC_SEC_DEFAULT,
+ PAN_SAM_RPC_SEC_ATLEAST,
+ PAN_SAM_RPC_SEC_EXACTLY
+};
+typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t;
+
+typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t;
+struct pan_sam_obj_sec_s {
+ pan_stor_sec_level_t min_security;
+ pan_sm_map_cap_t *map_ccaps;
+};
+
+typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t;
+struct pan_sam_rpc_sec_s {
+ pan_sam_rpc_sec_sel_t selector;
+};
+
+typedef struct pan_sam_read_args_s pan_sam_read_args_t;
+struct pan_sam_read_args_s {
+ pan_stor_obj_id_t obj_id;
+ pan_sm_cache_ptr_t obj_ent;
+ void *return_attr;
+ void *checksum;
+ pan_stor_offset_t offset;
+ pan_uint16_t sm_options;
+ void *callout;
+ void *callout_arg;
+};
+
+typedef struct pan_sam_read_res_s pan_sam_read_res_t;
+struct pan_sam_read_res_s {
+ pan_status_t result;
+ pan_sam_ext_status_t ext_status;
+ pan_stor_len_t length;
+ void *attr;
+ void *checksum;
+};
+
+typedef void (*pan_sam_read_cb_t)(
+ void *user_arg1,
+ void *user_arg2,
+ pan_sam_read_res_t *res_p,
+ pan_status_t status);
+
+#define PAN_SAM_ACCESS_NONE 0x0000
+#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020
+
+typedef struct pan_sam_write_args_s pan_sam_write_args_t;
+struct pan_sam_write_args_s {
+ pan_stor_obj_id_t obj_id;
+ pan_sm_cache_ptr_t obj_ent;
+ pan_stor_offset_t offset;
+ void *attr;
+ void *return_attr;
+};
+
+typedef struct pan_sam_write_res_s pan_sam_write_res_t;
+struct pan_sam_write_res_s {
+ pan_status_t result;
+ pan_sam_ext_status_t ext_status;
+ pan_stor_len_t length;
+ pan_stor_delta_len_t delta_capacity_used;
+ pan_bool_t parity_dirty;
+ void *attr;
+};
+
+typedef void (*pan_sam_write_cb_t)(
+ void *user_arg1,
+ void *user_arg2,
+ pan_sam_write_res_t *res_p,
+ pan_status_t status);
+
+/*
+ * from pan_mgr_types.h
+ */
+#define PAN_MGR_ID_TYPE_SHIFT 56
+#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL)
+#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL)
+
+typedef pan_uint16_t pan_mgr_type_t;
+typedef pan_uint64_t pan_mgr_id_t;
+
+#define PAN_MGR_SM ((pan_mgr_type_t) 2U)
+#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U)
+
+/*
+ * from pan_mgr_types_c.h
+ */
+#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \
+ pan_mgr_id_t _id1, _id2; \
+\
+ _id1 = (_mgr_type_); \
+ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \
+ _id1 &= PAN_MGR_ID_TYPE_MASK; \
+ _id2 = (_mgr_uniq_); \
+ _id2 &= PAN_MGR_ID_UNIQ_MASK; \
+ _id1 |= _id2; \
+ *(_mgr_id_p_) = _id1; \
+}
+
+/*
+ * from pan_storage_c.h
+ */
+#define pan_stor_is_device_id_an_obsd_id(_device_id_) \
+ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \
+ == PAN_MGR_OBSD)
+
+/*
+ * pnfs_shim internal definitions
+ */
+
+struct panfs_shim_io_state {
+ struct objlayout_io_state ol_state;
+
+ pan_sg_entry_t *sg_list;
+ pan_sam_obj_sec_t obj_sec;
+ void *ucreds;
+ union {
+ struct {
+ pan_sam_read_args_t args;
+ pan_sam_read_res_t res;
+ } read;
+ struct {
+ pan_sam_write_args_t args;
+ pan_sam_write_res_t res;
+ } write;
+ } u;
+};
+
+#endif /* _PANLAYOUT_PANFS_SHIM_H */
diff -up linux-2.6.38.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.38.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
--- linux-2.6.38.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2011-03-26 07:57:44.268821362 -0400
+++ linux-2.6.38.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2011-03-26 07:57:44.268821362 -0400
@@ -0,0 +1,435 @@
+/*
+ * pnfs_osd_xdr.c
+ *
+ * Object-Based pNFS Layout XDR layer
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/pnfs_osd_xdr.h>
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/*
+ * The following implementation is based on these Internet Drafts:
+ *
+ * draft-ietf-nfsv4-minorversion-21
+ * draft-ietf-nfsv4-pnfs-obj-12
+ */
+
+/*
+ * struct pnfs_osd_objid {
+ * struct pnfs_deviceid oid_device_id;
+ * u64 oid_partition_id;
+ * u64 oid_object_id;
+ * };
+ */
+static inline u32 *
+pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid)
+{
+ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data));
+ READ64(objid->oid_partition_id);
+ READ64(objid->oid_object_id);
+ return p;
+}
+
+static inline u32 *
+pnfs_osd_xdr_decode_opaque_cred(u32 *p,
+ struct pnfs_osd_opaque_cred *opaque_cred)
+{
+ READ32(opaque_cred->cred_len);
+ COPYMEM(opaque_cred->cred, opaque_cred->cred_len);
+ return p;
+}
+
+/*
+ * struct pnfs_osd_object_cred {
+ * struct pnfs_osd_objid oc_object_id;
+ * u32 oc_osd_version;
+ * u32 oc_cap_key_sec;
+ * struct pnfs_osd_opaque_cred oc_cap_key
+ * struct pnfs_osd_opaque_cred oc_cap;
+ * };
+ */
+static inline u32 *
+pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp,
+ u8 **credp)
+{
+ u8 *cred;
+
+ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id);
+ READ32(comp->oc_osd_version);
+ READ32(comp->oc_cap_key_sec);
+
+ cred = *credp;
+ comp->oc_cap_key.cred = cred;
+ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key);
+ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len));
+ comp->oc_cap.cred = cred;
+ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap);
+ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len));
+ *credp = cred;
+
+ return p;
+}
+
+/*
+ * struct pnfs_osd_data_map {
+ * u32 odm_num_comps;
+ * u64 odm_stripe_unit;
+ * u32 odm_group_width;
+ * u32 odm_group_depth;
+ * u32 odm_mirror_cnt;
+ * u32 odm_raid_algorithm;
+ * };
+ */
+static inline u32 *
+pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map)
+{
+ READ32(data_map->odm_num_comps);
+ READ64(data_map->odm_stripe_unit);
+ READ32(data_map->odm_group_width);
+ READ32(data_map->odm_group_depth);
+ READ32(data_map->odm_mirror_cnt);
+ READ32(data_map->odm_raid_algorithm);
+ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
+ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
+ __func__,
+ data_map->odm_num_comps,
+ (unsigned long long)data_map->odm_stripe_unit,
+ data_map->odm_group_width,
+ data_map->odm_group_depth,
+ data_map->odm_mirror_cnt,
+ data_map->odm_raid_algorithm);
+ return p;
+}
+
+struct pnfs_osd_layout *
+pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p)
+{
+ int i;
+ u32 *start = p;
+ struct pnfs_osd_object_cred *comp;
+ u8 *cred;
+
+ p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map);
+ READ32(layout->olo_comps_index);
+ READ32(layout->olo_num_comps);
+ layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1);
+ comp = layout->olo_comps;
+ cred = (u8 *)(comp + layout->olo_num_comps);
+ dprintk("%s: comps_index=%u num_comps=%u\n",
+ __func__, layout->olo_comps_index, layout->olo_num_comps);
+ for (i = 0; i < layout->olo_num_comps; i++) {
+ p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred);
+ dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx "
+ "key_len=%u cap_len=%u\n",
+ __func__, i,
+ _DEVID_LO(&comp->oc_object_id.oid_device_id),
+ _DEVID_HI(&comp->oc_object_id.oid_device_id),
+ comp->oc_object_id.oid_partition_id,
+ comp->oc_object_id.oid_object_id,
+ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
+ comp++;
+ }
+ dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__,
+ (char *)p - (char *)start, cred, (char *)cred - (char *)layout);
+ return layout;
+}
+
+/*
+ * Get Device Information Decoding
+ *
+ * Note: since Device Information is currently done synchronously, most
+ * of the actual fields are left inside the rpc buffer and are only
+ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
+ * should not be freed while the returned information is in use.
+ */
+
+u32 *__xdr_read_calc_nfs4_string(
+ u32 *p, struct nfs4_string *str, u8 **freespace)
+{
+ u32 len;
+ char *data;
+ bool need_copy;
+
+ READ32(len);
+ data = (char *)p;
+
+	if (data[len]) { /* Not null-terminated; we'll need extra space */
+ data = *freespace;
+ *freespace += len + 1;
+ need_copy = true;
+ } else {
+ need_copy = false;
+ }
+
+ if (str) {
+ str->len = len;
+ str->data = data;
+ if (need_copy) {
+ memcpy(data, p, len);
+ data[len] = 0;
+ }
+ }
+
+ p += XDR_QUADLEN(len);
+ return p;
+}
+
+u32 *__xdr_read_calc_u8_opaque(
+ u32 *p, struct nfs4_string *str)
+{
+ u32 len;
+
+ READ32(len);
+
+ if (str) {
+ str->len = len;
+ str->data = (char *)p;
+ }
+
+ p += XDR_QUADLEN(len);
+ return p;
+}
+
+/*
+ * struct pnfs_osd_targetid {
+ * u32 oti_type;
+ * struct nfs4_string oti_scsi_device_id;
+ * };
+ */
+u32 *__xdr_read_calc_targetid(
+ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace)
+{
+ u32 oti_type;
+
+ READ32(oti_type);
+ if (targetid)
+ targetid->oti_type = oti_type;
+
+ switch (oti_type) {
+ case OBJ_TARGET_SCSI_NAME:
+ case OBJ_TARGET_SCSI_DEVICE_ID:
+ p = __xdr_read_calc_u8_opaque(p,
+ targetid ? &targetid->oti_scsi_device_id : NULL);
+ }
+
+ return p;
+}
+
+/*
+ * struct pnfs_osd_net_addr {
+ * struct nfs4_string r_netid;
+ * struct nfs4_string r_addr;
+ * };
+ */
+u32 *__xdr_read_calc_net_addr(
+ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace)
+{
+
+ p = __xdr_read_calc_nfs4_string(p,
+ netaddr ? &netaddr->r_netid : NULL,
+ freespace);
+
+ p = __xdr_read_calc_nfs4_string(p,
+ netaddr ? &netaddr->r_addr : NULL,
+ freespace);
+
+ return p;
+}
+
+/*
+ * struct pnfs_osd_targetaddr {
+ * u32 ota_available;
+ * struct pnfs_osd_net_addr ota_netaddr;
+ * };
+ */
+u32 *__xdr_read_calc_targetaddr(
+ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace)
+{
+ u32 ota_available;
+
+ READ32(ota_available);
+ if (targetaddr)
+ targetaddr->ota_available = ota_available;
+
+ if (ota_available) {
+ p = __xdr_read_calc_net_addr(p,
+ targetaddr ? &targetaddr->ota_netaddr : NULL,
+ freespace);
+ }
+
+ return p;
+}
+
+/*
+ * struct pnfs_osd_deviceaddr {
+ * struct pnfs_osd_targetid oda_targetid;
+ * struct pnfs_osd_targetaddr oda_targetaddr;
+ * u8 oda_lun[8];
+ * struct nfs4_string oda_systemid;
+ * struct pnfs_osd_object_cred oda_root_obj_cred;
+ * struct nfs4_string oda_osdname;
+ * };
+ */
+u32 *__xdr_read_calc_deviceaddr(
+ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace)
+{
+ p = __xdr_read_calc_targetid(p,
+ deviceaddr ? &deviceaddr->oda_targetid : NULL,
+ freespace);
+
+ p = __xdr_read_calc_targetaddr(p,
+ deviceaddr ? &deviceaddr->oda_targetaddr : NULL,
+ freespace);
+
+ if (deviceaddr)
+ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun));
+ else
+ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun));
+
+ p = __xdr_read_calc_u8_opaque(p,
+ deviceaddr ? &deviceaddr->oda_systemid : NULL);
+
+ if (deviceaddr) {
+ p = pnfs_osd_xdr_decode_object_cred(p,
+ &deviceaddr->oda_root_obj_cred, freespace);
+ } else {
+ *freespace += pnfs_osd_object_cred_incore_sz(p);
+ p += pnfs_osd_object_cred_xdr_sz(p);
+ }
+
+ p = __xdr_read_calc_u8_opaque(p,
+ deviceaddr ? &deviceaddr->oda_osdname : NULL);
+
+ return p;
+}
+
+size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p)
+{
+ u8 *null_freespace = NULL;
+ size_t sz;
+
+ __xdr_read_calc_deviceaddr(p, NULL, &null_freespace);
+ sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace;
+
+ return sz;
+}
+
+void pnfs_osd_xdr_decode_deviceaddr(
+ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p)
+{
+ u8 *freespace = (u8 *)(deviceaddr + 1);
+
+ __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace);
+}
+
+/*
+ * struct pnfs_osd_layoutupdate {
+ * u32 dsu_valid;
+ * s64 dsu_delta;
+ * u32 olu_ioerr_flag;
+ * };
+ */
+int
+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
+ struct pnfs_osd_layoutupdate *lou)
+{
+ __be32 *p = xdr_reserve_space(xdr, 16);
+
+ if (!p)
+ return -E2BIG;
+
+ *p++ = cpu_to_be32(lou->dsu_valid);
+ if (lou->dsu_valid)
+ p = xdr_encode_hyper(p, lou->dsu_delta);
+ *p++ = cpu_to_be32(lou->olu_ioerr_flag);
+ return 0;
+}
+
+/*
+ * struct pnfs_osd_objid {
+ * struct pnfs_deviceid oid_device_id;
+ * u64 oid_partition_id;
+ * u64 oid_object_id;
+ */
+static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr,
+ struct pnfs_osd_objid *object_id)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 32);
+ if (!p)
+ return -E2BIG;
+
+ p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
+ sizeof(object_id->oid_device_id.data));
+ p = xdr_encode_hyper(p, object_id->oid_partition_id);
+ p = xdr_encode_hyper(p, object_id->oid_object_id);
+
+ return 0;
+}
+
+/*
+ * struct pnfs_osd_ioerr {
+ * struct pnfs_osd_objid oer_component;
+ * u64 oer_comp_offset;
+ * u64 oer_comp_length;
+ * u32 oer_iswrite;
+ * u32 oer_errno;
+ * };
+ */
+int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr,
+ struct pnfs_osd_ioerr *ioerr)
+{
+ __be32 *p;
+ int ret;
+
+ ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component);
+ if (ret)
+ return ret;
+
+ p = xdr_reserve_space(xdr, 24);
+ if (!p)
+ return -E2BIG;
+
+ p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
+ p = xdr_encode_hyper(p, ioerr->oer_comp_length);
+ *p++ = cpu_to_be32(ioerr->oer_iswrite);
+ *p = cpu_to_be32(ioerr->oer_errno);
+
+ return 0;
+}
diff -up linux-2.6.38.noarch/fs/nfs/pagelist.c.orig linux-2.6.38.noarch/fs/nfs/pagelist.c
--- linux-2.6.38.noarch/fs/nfs/pagelist.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/pagelist.c 2011-03-26 07:57:44.269821354 -0400
@@ -20,6 +20,7 @@
#include <linux/nfs_mount.h>
#include "internal.h"
+#include "pnfs.h"
static struct kmem_cache *nfs_page_cachep;
@@ -53,7 +54,8 @@ nfs_page_free(struct nfs_page *p)
struct nfs_page *
nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
struct page *page,
- unsigned int offset, unsigned int count)
+ unsigned int offset, unsigned int count,
+ struct pnfs_layout_segment *lseg)
{
struct nfs_page *req;
@@ -84,6 +86,9 @@ nfs_create_request(struct nfs_open_conte
req->wb_bytes = count;
req->wb_context = get_nfs_open_context(ctx);
kref_init(&req->wb_kref);
+ req->wb_lseg = lseg;
+ if (lseg)
+ get_lseg(lseg);
return req;
}
@@ -159,9 +164,12 @@ void nfs_clear_request(struct nfs_page *
put_nfs_open_context(ctx);
req->wb_context = NULL;
}
+ if (req->wb_lseg != NULL) {
+ put_lseg(req->wb_lseg);
+ req->wb_lseg = NULL;
+ }
}
-
/**
* nfs_release_request - Release the count on an NFS read/write request
* @req: request to release
@@ -240,7 +248,8 @@ void nfs_pageio_init(struct nfs_pageio_d
* Return 'true' if this is the case, else return 'false'.
*/
static int nfs_can_coalesce_requests(struct nfs_page *prev,
- struct nfs_page *req)
+ struct nfs_page *req,
+ struct nfs_pageio_descriptor *pgio)
{
if (req->wb_context->cred != prev->wb_context->cred)
return 0;
@@ -254,6 +263,12 @@ static int nfs_can_coalesce_requests(str
return 0;
if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
return 0;
+ if (req->wb_lseg != prev->wb_lseg)
+ return 0;
+#ifdef CONFIG_NFS_V4_1
+ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
+ return 0;
+#endif /* CONFIG_NFS_V4_1 */
return 1;
}
@@ -286,7 +301,7 @@ static int nfs_pageio_do_add_request(str
if (newlen > desc->pg_bsize)
return 0;
prev = nfs_list_entry(desc->pg_list.prev);
- if (!nfs_can_coalesce_requests(prev, req))
+ if (!nfs_can_coalesce_requests(prev, req, desc))
return 0;
} else
desc->pg_base = req->wb_pgbase;
@@ -375,6 +390,7 @@ void nfs_pageio_cond_complete(struct nfs
* @idx_start: lower bound of page->index to scan
* @npages: idx_start + npages sets the upper bound to scan.
* @tag: tag to scan for
+ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver
*
* Moves elements from one of the inode request lists.
* If the number of requests is set to 0, the entire address_space
@@ -384,7 +400,7 @@ void nfs_pageio_cond_complete(struct nfs
*/
int nfs_scan_list(struct nfs_inode *nfsi,
struct list_head *dst, pgoff_t idx_start,
- unsigned int npages, int tag)
+ unsigned int npages, int tag, int *use_pnfs)
{
struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
struct nfs_page *req;
@@ -415,6 +431,8 @@ int nfs_scan_list(struct nfs_inode *nfsi
radix_tree_tag_clear(&nfsi->nfs_page_tree,
req->wb_index, tag);
nfs_list_add_request(req, dst);
+ if (req->wb_lseg)
+ *use_pnfs = 1;
res++;
if (res == INT_MAX)
goto out;
diff -up linux-2.6.38.noarch/fs/nfs/pnfs.c.orig linux-2.6.38.noarch/fs/nfs/pnfs.c
--- linux-2.6.38.noarch/fs/nfs/pnfs.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/pnfs.c 2011-03-26 07:57:44.271821338 -0400
@@ -30,6 +30,7 @@
#include <linux/nfs_fs.h>
#include "internal.h"
#include "pnfs.h"
+#include "iostat.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -71,6 +72,52 @@ find_pnfs_driver(u32 id)
return local;
}
+/* Set cred to indicate we require a layoutcommit
+ * If we don't even have a layout, we don't need to commit it.
+ */
+void
+pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx)
+{
+ dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx);
+ spin_lock(&nfsi->vfs_inode.i_lock);
+ if (has_layout(nfsi) &&
+ !test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags)) {
+ nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred);
+ __set_bit(NFS_LAYOUT_NEED_LCOMMIT,
+ &nfsi->layout->plh_flags);
+ nfsi->change_attr++;
+ spin_unlock(&nfsi->vfs_inode.i_lock);
+ dprintk("%s: Set layoutcommit\n", __func__);
+ return;
+ }
+ spin_unlock(&nfsi->vfs_inode.i_lock);
+}
+
+/* Update last_write_offset for layoutcommit.
+ * TODO: We should only use commited extents, but the current nfs
+ * implementation does not calculate the written range in nfs_commit_done.
+ * We therefore update this field in writeback_done.
+ */
+void
+pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent)
+{
+ loff_t end_pos;
+
+ spin_lock(&nfsi->vfs_inode.i_lock);
+ if (offset < nfsi->layout->write_begin_pos)
+ nfsi->layout->write_begin_pos = offset;
+ end_pos = offset + extent - 1; /* I'm being inclusive */
+ if (end_pos > nfsi->layout->write_end_pos)
+ nfsi->layout->write_end_pos = end_pos;
+ dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n",
+ __func__,
+ (unsigned long) extent,
+ (unsigned long) offset ,
+ (unsigned long) nfsi->layout->write_begin_pos,
+ (unsigned long) nfsi->layout->write_end_pos);
+ spin_unlock(&nfsi->vfs_inode.i_lock);
+}
+
void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
@@ -88,7 +135,8 @@ unset_pnfs_layoutdriver(struct nfs_serve
* @id layout type. Zero (illegal layout type) indicates pNFS not in use.
*/
void
-set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
+set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
+ u32 id)
{
struct pnfs_layoutdriver_type *ld_type = NULL;
@@ -115,7 +163,7 @@ set_pnfs_layoutdriver(struct nfs_server
goto out_no_driver;
}
server->pnfs_curr_ld = ld_type;
- if (ld_type->set_layoutdriver(server)) {
+ if (ld_type->set_layoutdriver(server, mntfh)) {
printk(KERN_ERR
"%s: Error initializing mount point for layout driver %u.\n",
__func__, id);
@@ -146,6 +194,14 @@ pnfs_register_layoutdriver(struct pnfs_l
return status;
}
+ if (!ld_type->read_pagelist || !ld_type->write_pagelist ||
+ !ld_type->commit) {
+ printk(KERN_ERR "%s Layout driver must provide "
+ "read_pagelist, write_pagelist, and commit.\n",
+ __func__);
+ return status;
+ }
+
spin_lock(&pnfs_spinlock);
tmp = find_pnfs_driver_locked(ld_type->id);
if (!tmp) {
@@ -184,18 +240,35 @@ get_layout_hdr(struct pnfs_layout_hdr *l
atomic_inc(&lo->plh_refcount);
}
+static struct pnfs_layout_hdr *
+pnfs_alloc_layout_hdr(struct inode *ino)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+ return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino) :
+ kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
+}
+
+static void
+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+ return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
+}
+
static void
destroy_layout_hdr(struct pnfs_layout_hdr *lo)
{
dprintk("%s: freeing layout cache %p\n", __func__, lo);
BUG_ON(!list_empty(&lo->plh_layouts));
NFS_I(lo->plh_inode)->layout = NULL;
- kfree(lo);
+ pnfs_free_layout_hdr(lo);
}
static void
put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
{
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ BUG_ON(atomic_read(&lo->plh_refcount) == 0);
if (atomic_dec_and_test(&lo->plh_refcount))
destroy_layout_hdr(lo);
}
@@ -205,6 +278,7 @@ put_layout_hdr(struct pnfs_layout_hdr *l
{
struct inode *inode = lo->plh_inode;
+ BUG_ON(atomic_read(&lo->plh_refcount) == 0);
if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
destroy_layout_hdr(lo);
spin_unlock(&inode->i_lock);
@@ -225,64 +299,136 @@ static void free_lseg(struct pnfs_layout
{
struct inode *ino = lseg->pls_layout->plh_inode;
+ BUG_ON(atomic_read(&lseg->pls_refcount) != 0);
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
/* Matched by get_layout_hdr in pnfs_insert_layout */
put_layout_hdr(NFS_I(ino)->layout);
}
-/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
- * could sleep, so must be called outside of the lock.
- * Returns 1 if object was removed, otherwise return 0.
- */
-static int
-put_lseg_locked(struct pnfs_layout_segment *lseg,
- struct list_head *tmp_list)
+static void
+put_lseg_common(struct pnfs_layout_segment *lseg)
+{
+ struct inode *inode = lseg->pls_layout->plh_inode;
+
+ BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ list_del_init(&lseg->pls_list);
+ if (list_empty(&lseg->pls_layout->plh_segs)) {
+ set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
+ /* Matched by initial refcount set in alloc_init_layout_hdr */
+ put_layout_hdr_locked(lseg->pls_layout);
+ }
+ rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
+}
+
+void
+put_lseg(struct pnfs_layout_segment *lseg)
{
+ struct inode *ino;
+
+ if (!lseg)
+ return;
+
dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
atomic_read(&lseg->pls_refcount),
test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
- if (atomic_dec_and_test(&lseg->pls_refcount)) {
- struct inode *ino = lseg->pls_layout->plh_inode;
+ ino = lseg->pls_layout->plh_inode;
+ if (atomic_dec_and_lock(&lseg->pls_refcount, &ino->i_lock)) {
+ put_lseg_common(lseg);
+ spin_unlock(&ino->i_lock);
+ free_lseg(lseg);
+ }
+}
+EXPORT_SYMBOL_GPL(put_lseg);
- BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
- list_del(&lseg->pls_list);
- if (list_empty(&lseg->pls_layout->plh_segs)) {
- struct nfs_client *clp;
+static inline u64
+end_offset(u64 start, u64 len)
+{
+ u64 end;
- clp = NFS_SERVER(ino)->nfs_client;
- spin_lock(&clp->cl_lock);
- /* List does not take a reference, so no need for put here */
- list_del_init(&lseg->pls_layout->plh_layouts);
- spin_unlock(&clp->cl_lock);
- clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
- }
- rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
- list_add(&lseg->pls_list, tmp_list);
- return 1;
- }
- return 0;
+ end = start + len;
+ return end >= start ? end: NFS4_MAX_UINT64;
}
-static bool
-should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
{
- return (recall_iomode == IOMODE_ANY ||
- lseg_iomode == recall_iomode);
+ u64 end;
+
+ BUG_ON(!len);
+ end = start + len;
+ return end > start ? end - 1: NFS4_MAX_UINT64;
}
-/* Returns 1 if lseg is removed from list, 0 otherwise */
-static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
- struct list_head *tmp_list)
+/*
+ * is l2 fully contained in l1?
+ * start1 end1
+ * [----------------------------------)
+ * start2 end2
+ * [----------------)
+ */
+static inline int
+lo_seg_contained(struct pnfs_layout_range *l1,
+ struct pnfs_layout_range *l2)
+{
+ u64 start1 = l1->offset;
+ u64 end1 = end_offset(start1, l1->length);
+ u64 start2 = l2->offset;
+ u64 end2 = end_offset(start2, l2->length);
+
+ return (start1 <= start2) && (end1 >= end2);
+}
+
+/*
+ * are l1 and l2 intersecting?
+ * start1 end1
+ * [----------------------------------)
+ * start2 end2
+ * [----------------)
+ */
+static inline int
+lo_seg_intersecting(struct pnfs_layout_range *l1,
+ struct pnfs_layout_range *l2)
{
- int rv = 0;
+ u64 start1 = l1->offset;
+ u64 end1 = end_offset(start1, l1->length);
+ u64 start2 = l2->offset;
+ u64 end2 = end_offset(start2, l2->length);
+
+ return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
+ (end2 == NFS4_MAX_UINT64 || end2 > start1);
+}
- if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+bool
+should_free_lseg(struct pnfs_layout_range *lseg_range,
+ struct pnfs_layout_range *recall_range)
+{
+ return (recall_range->iomode == IOMODE_ANY ||
+ lseg_range->iomode == recall_range->iomode) &&
+ lo_seg_intersecting(lseg_range, recall_range);
+}
+
+static bool mark_lseg_invalid(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list)
+{
+ bool rv;
+
+ assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock);
+ rv = test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags);
+ if (rv) {
/* Remove the reference keeping the lseg in the
* list. It will now be removed when all
* outstanding io is finished.
*/
- rv = put_lseg_locked(lseg, tmp_list);
+ dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+ atomic_read(&lseg->pls_refcount));
+ if (atomic_dec_and_test(&lseg->pls_refcount)) {
+ put_lseg_common(lseg);
+ list_add(&lseg->pls_list, tmp_list);
+ rv = true;
+ }
}
+
return rv;
}
@@ -292,18 +438,24 @@ static int mark_lseg_invalid(struct pnfs
int
mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- u32 iomode)
+ struct pnfs_layout_range *recall_range)
{
struct pnfs_layout_segment *lseg, *next;
int invalid = 0, removed = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
+ if (list_empty(&lo->plh_segs)) {
+ if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
+ put_layout_hdr_locked(lo);
+ return 0;
+ }
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
- if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
+ if (should_free_lseg(&lseg->pls_range, recall_range)) {
dprintk("%s: freeing lseg %p iomode %d "
"offset %llu length %llu\n", __func__,
- lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
+ lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset,
lseg->pls_range.length);
invalid++;
removed += mark_lseg_invalid(lseg, tmp_list);
@@ -312,11 +464,57 @@ mark_matching_lsegs_invalid(struct pnfs_
return invalid - removed;
}
+/* Returns false if there was nothing to do, true otherwise */
+static bool
+pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
+ struct pnfs_layout_range *range)
+{
+ struct pnfs_layout_segment *lseg, *next;
+ bool rv = false;
+
+ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
+ __func__, lo, range->offset, range->length, range->iomode);
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ if (list_empty(&lo->plh_segs)) {
+ if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
+ put_layout_hdr_locked(lo);
+ return 0;
+ }
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+ if (should_free_lseg(&lseg->pls_range, range)) {
+ dprintk("%s: freeing lseg %p iomode %d "
+ "offset %llu length %llu\n", __func__,
+ lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset,
+ lseg->pls_range.length);
+ mark_lseg_invalid(lseg, tmp_list);
+ rv = true;
+ }
+ dprintk("%s:Return %d\n", __func__, rv);
+ return rv;
+}
+
+/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
struct pnfs_layout_segment *lseg, *tmp;
+ struct pnfs_layout_hdr *lo;
+
+ if (list_empty(free_me))
+ return;
+
+ lo = list_first_entry(free_me, struct pnfs_layout_segment,
+ pls_list)->pls_layout;
+ if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
+ struct nfs_client *clp;
+
+ clp = NFS_SERVER(lo->plh_inode)->nfs_client;
+ spin_lock(&clp->cl_lock);
+ list_del_init(&lo->plh_layouts);
+ spin_unlock(&clp->cl_lock);
+ }
list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
list_del(&lseg->pls_list);
free_lseg(lseg);
@@ -328,14 +526,17 @@ pnfs_destroy_layout(struct nfs_inode *nf
{
struct pnfs_layout_hdr *lo;
LIST_HEAD(tmp_list);
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
spin_lock(&nfsi->vfs_inode.i_lock);
lo = nfsi->layout;
if (lo) {
- set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
- mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
- /* Matched by refcount set to 1 in alloc_init_layout_hdr */
- put_layout_hdr_locked(lo);
+ lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
+ mark_matching_lsegs_invalid(lo, &tmp_list, &range);
}
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
@@ -371,16 +572,14 @@ pnfs_set_layout_stateid(struct pnfs_layo
{
u32 oldseq, newseq;
+ assert_spin_locked(&lo->plh_inode->i_lock);
oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
newseq = be32_to_cpu(new->stateid.seqid);
if ((int)(newseq - oldseq) > 0) {
memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
- if (update_barrier) {
- u32 new_barrier = be32_to_cpu(new->stateid.seqid);
-
- if ((int)(new_barrier - lo->plh_barrier))
- lo->plh_barrier = new_barrier;
- } else {
+ if (update_barrier)
+ lo->plh_barrier = be32_to_cpu(new->stateid.seqid);
+ else {
/* Because of wraparound, we want to keep the barrier
* "close" to the current seqids. It needs to be
* within 2**31 to count as "behind", so if it
@@ -403,6 +602,7 @@ pnfs_layoutgets_blocked(struct pnfs_layo
(int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
return true;
return lo->plh_block_lgets ||
+ test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
(list_empty(&lo->plh_segs) &&
(atomic_read(&lo->plh_outstanding) > lget));
@@ -429,7 +629,7 @@ pnfs_choose_layoutget_stateid(nfs4_state
} else
memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
spin_unlock(&lo->plh_inode->i_lock);
- dprintk("<-- %s\n", __func__);
+ dprintk("<-- %s status=%d\n", __func__, status);
return status;
}
@@ -442,7 +642,7 @@ pnfs_choose_layoutget_stateid(nfs4_state
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx,
- u32 iomode)
+ struct pnfs_layout_range *range)
{
struct inode *ino = lo->plh_inode;
struct nfs_server *server = NFS_SERVER(ino);
@@ -455,11 +655,11 @@ send_layoutget(struct pnfs_layout_hdr *l
lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
if (lgp == NULL)
return NULL;
- lgp->args.minlength = NFS4_MAX_UINT64;
+ lgp->args.minlength = PAGE_CACHE_SIZE;
+ if (lgp->args.minlength > range->length)
+ lgp->args.minlength = range->length;
lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
- lgp->args.range.iomode = iomode;
- lgp->args.range.offset = 0;
- lgp->args.range.length = NFS4_MAX_UINT64;
+ lgp->args.range = *range;
lgp->args.type = server->pnfs_curr_ld->id;
lgp->args.inode = ino;
lgp->args.ctx = get_nfs_open_context(ctx);
@@ -471,11 +671,82 @@ send_layoutget(struct pnfs_layout_hdr *l
nfs4_proc_layoutget(lgp);
if (!lseg) {
/* remember that LAYOUTGET failed and suspend trying */
- set_bit(lo_fail_bit(iomode), &lo->plh_flags);
+ set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
}
return lseg;
}
+static int
+return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
+{
+ struct nfs4_layoutreturn *lrp;
+ struct nfs_server *server = NFS_SERVER(ino);
+ int status = -ENOMEM;
+
+ dprintk("--> %s\n", __func__);
+
+ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
+ if (lrp == NULL) {
+ put_layout_hdr(NFS_I(ino)->layout);
+ goto out;
+ }
+ lrp->args.reclaim = 0;
+ lrp->args.layout_type = server->pnfs_curr_ld->id;
+ lrp->args.return_type = RETURN_FILE;
+ lrp->args.range = *range;
+ lrp->args.inode = ino;
+ lrp->clp = server->nfs_client;
+
+ status = nfs4_proc_layoutreturn(lrp, wait);
+out:
+ dprintk("<-- %s status: %d\n", __func__, status);
+ return status;
+}
+
+/* Initiates a LAYOUTRETURN(FILE) */
+int
+_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range,
+ bool wait)
+{
+ struct pnfs_layout_hdr *lo = NULL;
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_range arg;
+ LIST_HEAD(tmp_list);
+ int status = 0;
+
+ dprintk("--> %s\n", __func__);
+
+ arg.iomode = range ? range->iomode : IOMODE_ANY;
+ arg.offset = 0;
+ arg.length = NFS4_MAX_UINT64;
+
+ spin_lock(&ino->i_lock);
+ lo = nfsi->layout;
+ if (!lo || !pnfs_clear_lseg_list(lo, &tmp_list, &arg)) {
+ spin_unlock(&ino->i_lock);
+ dprintk("%s: no layout segments to return\n", __func__);
+ goto out;
+ }
+ /* Reference matched in nfs4_layoutreturn_release */
+ get_layout_hdr(lo);
+ spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&tmp_list);
+
+ if (layoutcommit_needed(nfsi)) {
+ status = pnfs_layoutcommit_inode(ino, wait);
+ if (status) {
+ /* Return layout even if layoutcommit fails */
+ dprintk("%s: layoutcommit failed, status=%d. "
+ "Returning layout anyway\n",
+ __func__, status);
+ }
+ }
+ status = return_layout(ino, &arg, wait);
+out:
+ dprintk("<-- %s status: %d\n", __func__, status);
+ return status;
+}
+
bool pnfs_roc(struct inode *ino)
{
struct pnfs_layout_hdr *lo;
@@ -559,10 +830,24 @@ bool pnfs_roc_drain(struct inode *ino, u
* are seen first.
*/
static s64
-cmp_layout(u32 iomode1, u32 iomode2)
+cmp_layout(struct pnfs_layout_range *l1,
+ struct pnfs_layout_range *l2)
{
+ s64 d;
+
+ /* higher offset > lower offset */
+ d = l1->offset - l2->offset;
+ if (d)
+ return d;
+
+ /* longer length > shorter length */
+ d = l1->length - l2->length;
+ if (d)
+ return d;
+
/* read > read/write */
- return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
+ return (int)(l2->iomode == IOMODE_READ) -
+ (int)(l1->iomode == IOMODE_READ);
}
static void
@@ -576,7 +861,7 @@ pnfs_insert_layout(struct pnfs_layout_hd
assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry(lp, &lo->plh_segs, pls_list) {
- if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
+ if (cmp_layout(&lp->pls_range, &lseg->pls_range) > 0)
continue;
list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p "
@@ -606,7 +891,7 @@ alloc_init_layout_hdr(struct inode *ino)
{
struct pnfs_layout_hdr *lo;
- lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
+ lo = pnfs_alloc_layout_hdr(ino);
if (!lo)
return NULL;
atomic_set(&lo->plh_refcount, 1);
@@ -639,13 +924,13 @@ pnfs_find_alloc_layout(struct inode *ino
if (likely(nfsi->layout == NULL)) /* Won the race? */
nfsi->layout = new;
else
- kfree(new);
+ pnfs_free_layout_hdr(new);
return nfsi->layout;
}
/*
* iomode matching rules:
- * iomode lseg match
+ * range lseg match
* ----- ----- -----
* ANY READ true
* ANY RW true
@@ -655,16 +940,28 @@ pnfs_find_alloc_layout(struct inode *ino
* READ RW true
*/
static int
-is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+is_matching_lseg(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_range *range)
{
- return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
+ struct pnfs_layout_range range1;
+
+ if ((range->iomode == IOMODE_RW &&
+ lseg->pls_range.iomode != IOMODE_RW) ||
+ !lo_seg_intersecting(&lseg->pls_range, range))
+ return 0;
+
+ /* range1 covers only the first byte in the range */
+ range1 = *range;
+ range1.length = 1;
+ return lo_seg_contained(&lseg->pls_range, &range1);
}
/*
* lookup range in layout
*/
static struct pnfs_layout_segment *
-pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
+pnfs_find_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range)
{
struct pnfs_layout_segment *lseg, *ret = NULL;
@@ -673,16 +970,17 @@ pnfs_find_lseg(struct pnfs_layout_hdr *l
assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
- is_matching_lseg(lseg, iomode)) {
- ret = lseg;
+ is_matching_lseg(lseg, range)) {
+ ret = get_lseg(lseg);
break;
}
- if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
+ if (cmp_layout(range, &lseg->pls_range) > 0)
break;
}
- dprintk("%s:Return lseg %p ref %d\n",
- __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
+ dprintk("%s:Return lseg %p ref %d valid %d\n",
+ __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0,
+ ret ? test_bit(NFS_LSEG_VALID, &ret->pls_flags) : 0);
return ret;
}
@@ -693,12 +991,20 @@ pnfs_find_lseg(struct pnfs_layout_hdr *l
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
+ loff_t pos,
+ u64 count,
enum pnfs_iomode iomode)
{
+ struct pnfs_layout_range arg = {
+ .iomode = iomode,
+ .offset = pos,
+ .length = count,
+ };
struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg = NULL;
+ bool first = false;
if (!pnfs_enabled_sb(NFS_SERVER(ino)))
return NULL;
@@ -715,21 +1021,25 @@ pnfs_update_layout(struct inode *ino,
dprintk("%s matches recall, use MDS\n", __func__);
goto out_unlock;
}
- /* Check to see if the layout for the given range already exists */
- lseg = pnfs_find_lseg(lo, iomode);
- if (lseg)
- goto out_unlock;
/* if LAYOUTGET already failed once we don't try again */
if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
goto out_unlock;
+ /* Check to see if the layout for the given range already exists */
+ lseg = pnfs_find_lseg(lo, &arg);
+ if (lseg)
+ goto out_unlock;
+
if (pnfs_layoutgets_blocked(lo, NULL, 0))
goto out_unlock;
atomic_inc(&lo->plh_outstanding);
get_layout_hdr(lo);
- if (list_empty(&lo->plh_segs)) {
+ if (list_empty(&lo->plh_segs))
+ first = true;
+ spin_unlock(&ino->i_lock);
+ if (first) {
/* The lo must be on the clp list if there is any
* chance of a CB_LAYOUTRECALL(FILE) coming in.
*/
@@ -738,24 +1048,18 @@ pnfs_update_layout(struct inode *ino,
list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
spin_unlock(&clp->cl_lock);
}
- spin_unlock(&ino->i_lock);
- lseg = send_layoutget(lo, ctx, iomode);
- if (!lseg) {
- spin_lock(&ino->i_lock);
- if (list_empty(&lo->plh_segs)) {
- spin_lock(&clp->cl_lock);
- list_del_init(&lo->plh_layouts);
- spin_unlock(&clp->cl_lock);
- clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
- }
- spin_unlock(&ino->i_lock);
+ lseg = send_layoutget(lo, ctx, &arg);
+ if (!lseg && first) {
+ spin_lock(&clp->cl_lock);
+ list_del_init(&lo->plh_layouts);
+ spin_unlock(&clp->cl_lock);
}
atomic_dec(&lo->plh_outstanding);
put_layout_hdr(lo);
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
- nfsi->layout->plh_flags, lseg);
+ nfsi->layout->plh_flags ? nfsi->layout->plh_flags : -1, lseg);
return lseg;
out_unlock:
spin_unlock(&ino->i_lock);
@@ -772,17 +1076,6 @@ pnfs_layout_process(struct nfs4_layoutge
struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
int status = 0;
- /* Verify we got what we asked for.
- * Note that because the xdr parsing only accepts a single
- * element array, this can fail even if the server is behaving
- * correctly.
- */
- if (lgp->args.range.iomode > res->range.iomode ||
- res->range.offset != 0 ||
- res->range.length != NFS4_MAX_UINT64) {
- status = -EINVAL;
- goto out;
- }
/* Inject layout blob into I/O device driver */
lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
if (!lseg || IS_ERR(lseg)) {
@@ -808,7 +1101,7 @@ pnfs_layout_process(struct nfs4_layoutge
}
init_lseg(lo, lseg);
lseg->pls_range = res->range;
- *lgp->lsegpp = lseg;
+ *lgp->lsegpp = get_lseg(lseg);
pnfs_insert_layout(lo, lseg);
if (res->return_on_close) {
@@ -829,6 +1122,523 @@ out_forget_reply:
goto out;
}
+void
+readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset,
+ size_t *count)
+{
+ struct page *first, *last;
+ loff_t foff, i_size = i_size_read(inode);
+ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+ size_t range;
+
+ first = list_entry((pages)->prev, struct page, lru);
+ last = list_entry((pages)->next, struct page, lru);
+
+ foff = (loff_t)first->index << PAGE_CACHE_SHIFT;
+
+ range = (last->index - first->index) * PAGE_CACHE_SIZE;
+ if (last->index == end_index)
+ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
+ else
+ range += PAGE_CACHE_SIZE;
+ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff,
+ range);
+ *offset = foff;
+ *count = range;
+}
+
+void
+pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio)
+{
+ struct pnfs_layout_hdr *lo;
+ struct pnfs_layoutdriver_type *ld;
+
+ pgio->pg_test = NULL;
+
+ lo = NFS_I(inode)->layout;
+ ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ if (!ld || !lo)
+ return;
+
+ pgio->pg_test = ld->pg_test;
+}
+
+/*
+ * rsize is already set by caller to MDS rsize.
+ */
+void
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode,
+ struct nfs_open_context *ctx,
+ struct list_head *pages,
+ size_t *rsize)
+{
+ struct nfs_server *nfss = NFS_SERVER(inode);
+ size_t count = 0;
+ loff_t loff;
+
+ pgio->pg_iswrite = 0;
+ pgio->pg_test = NULL;
+ pgio->pg_lseg = NULL;
+
+ if (!pnfs_enabled_sb(nfss))
+ return;
+
+ readahead_range(inode, pages, &loff, &count);
+ pgio->pg_lseg = pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ);
+ if (pgio->pg_lseg) {
+ pnfs_set_pg_test(inode, pgio);
+ *rsize = NFS_SERVER(inode)->ds_rsize;
+ }
+}
+
+void
+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
+ size_t *wsize)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+
+ pgio->pg_iswrite = 1;
+ if (!pnfs_enabled_sb(server))
+ pgio->pg_test = NULL;
+ else {
+ pnfs_set_pg_test(inode, pgio);
+ *wsize = server->ds_wsize;
+ }
+}
+
+/* Set buffer size for data servers */
+void
+pnfs_set_ds_iosize(struct nfs_server *server)
+{
+ unsigned dssize = 0;
+
+ if (server->pnfs_curr_ld && server->pnfs_curr_ld->get_blocksize)
+ dssize = server->pnfs_curr_ld->get_blocksize();
+ if (dssize)
+ server->ds_rsize = server->ds_wsize =
+ nfs_block_size(dssize, NULL);
+ else {
+ server->ds_wsize = server->wsize;
+ server->ds_rsize = server->rsize;
+ }
+}
+
+static int
+pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data)
+{
+ put_lseg(pdata->lseg);
+ pdata->lseg = NULL;
+ pdata->call_ops->rpc_call_done(task, data);
+ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN)
+ return -EAGAIN;
+ if (pdata->pnfsflags & PNFS_NO_RPC) {
+ pdata->call_ops->rpc_release(data);
+ } else {
+ /*
+ * just restore original rpc call ops
+ * rpc_release will be called later by the rpc scheduling layer.
+ */
+ task->tk_ops = pdata->call_ops;
+ }
+ return 0;
+}
+
+/* Post-write completion function
+ * Invoked by all layout drivers when write_pagelist is done.
+ *
+ * NOTE: callers set data->pnfsflags PNFS_NO_RPC
+ * so that the NFS cleanup routines perform only the page cache
+ * cleanup.
+ */
+static void
+pnfs_write_retry(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_write_data *wdata;
+ struct pnfs_layout_range range;
+
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ wdata = container_of(task, struct nfs_write_data, task);
+ range.iomode = IOMODE_RW;
+ range.offset = wdata->args.offset;
+ range.length = wdata->args.count;
+ _pnfs_return_layout(wdata->inode, &range, true);
+ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode),
+ wdata->pdata.call_ops, wdata->pdata.how);
+}
+
+void
+pnfs_writeback_done(struct nfs_write_data *data)
+{
+ struct pnfs_call_data *pdata = &data->pdata;
+
+ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
+
+ /* update last write offset and need layout commit
+ * for non-files layout types (files layout calls
+ * pnfs4_write_done for this)
+ */
+ if ((pdata->pnfsflags & PNFS_NO_RPC) &&
+ data->task.tk_status >= 0 && data->res.count > 0) {
+ struct nfs_inode *nfsi = NFS_I(data->inode);
+
+ pnfs_update_last_write(nfsi, data->args.offset, data->res.count);
+ pnfs_need_layoutcommit(nfsi, data->args.context);
+ }
+
+ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
+ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry);
+ queue_work(nfsiod_workqueue, &data->task.u.tk_work);
+ }
+}
+EXPORT_SYMBOL_GPL(pnfs_writeback_done);
+
+static void _pnfs_clear_lseg_from_pages(struct list_head *head)
+{
+ struct nfs_page *req;
+
+ list_for_each_entry(req, head, wb_list) {
+ put_lseg(req->wb_lseg);
+ req->wb_lseg = NULL;
+ }
+}
+
+/*
+ * Call the appropriate parallel I/O subsystem write function.
+ * If no I/O device driver exists, or one does not match the returned
+ * fstype, then return a positive status for regular NFS processing.
+ *
+ * TODO: Is wdata->how and wdata->args.stable always the same value?
+ * TODO: It seems in NFS, the server may not do a stable write even
+ * though it was requested (and vice-versa?). To check, it looks
+ * in data->res.verf->committed. Do we need this ability
+ * for non-file layout drivers?
+ */
+enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_write_data *wdata,
+ const struct rpc_call_ops *call_ops, int how)
+{
+ struct inode *inode = wdata->inode;
+ enum pnfs_try_status trypnfs;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg;
+
+ wdata->pdata.call_ops = call_ops;
+ wdata->pdata.pnfs_error = 0;
+ wdata->pdata.how = how;
+
+ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
+ inode->i_ino, wdata->args.count, wdata->args.offset, how);
+
+ get_lseg(lseg);
+
+ if (!pnfs_use_rpc(nfss))
+ wdata->pdata.pnfsflags |= PNFS_NO_RPC;
+ wdata->pdata.lseg = lseg;
+ trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata,
+ nfs_page_array_len(wdata->args.pgbase, wdata->args.count),
+ how);
+
+ if (trypnfs == PNFS_NOT_ATTEMPTED) {
+ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC;
+ wdata->pdata.lseg = NULL;
+ put_lseg(lseg);
+ _pnfs_clear_lseg_from_pages(&wdata->pages);
+ } else {
+ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
+ }
+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+ return trypnfs;
+}
+
+/* Post-read completion function. Invoked by all layout drivers when
+ * read_pagelist is done
+ */
+static void
+pnfs_read_retry(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_read_data *rdata;
+ struct pnfs_layout_range range;
+
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ rdata = container_of(task, struct nfs_read_data, task);
+ range.iomode = IOMODE_RW;
+ range.offset = rdata->args.offset;
+ range.length = rdata->args.count;
+ _pnfs_return_layout(rdata->inode, &range, true);
+ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode),
+ rdata->pdata.call_ops);
+}
+
+void
+pnfs_read_done(struct nfs_read_data *data)
+{
+ struct pnfs_call_data *pdata = &data->pdata;
+
+ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
+
+ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
+ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry);
+ queue_work(nfsiod_workqueue, &data->task.u.tk_work);
+ }
+}
+EXPORT_SYMBOL_GPL(pnfs_read_done);
+
+/*
+ * Call the appropriate parallel I/O subsystem read function.
+ * If no I/O device driver exists, or one does not match the returned
+ * fstype, then return a positive status for regular NFS processing.
+ */
+enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_read_data *rdata,
+ const struct rpc_call_ops *call_ops)
+{
+ struct inode *inode = rdata->inode;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg;
+ enum pnfs_try_status trypnfs;
+
+ rdata->pdata.call_ops = call_ops;
+ rdata->pdata.pnfs_error = 0;
+
+ dprintk("%s: Reading ino:%lu %u@%llu\n",
+ __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
+
+ get_lseg(lseg);
+
+ if (!pnfs_use_rpc(nfss))
+ rdata->pdata.pnfsflags |= PNFS_NO_RPC;
+ rdata->pdata.lseg = lseg;
+ trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata,
+ nfs_page_array_len(rdata->args.pgbase, rdata->args.count));
+ if (trypnfs == PNFS_NOT_ATTEMPTED) {
+ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC;
+ rdata->pdata.lseg = NULL;
+ put_lseg(lseg);
+ _pnfs_clear_lseg_from_pages(&rdata->pages);
+ } else {
+ nfs_inc_stats(inode, NFSIOS_PNFS_READ);
+ }
+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+ return trypnfs;
+}
+
+/*
+ * This gives the layout driver an opportunity to read in pages "around"
+ * the data to be written. It returns 0 on success, otherwise an error code
+ * which will either be passed up to user, or ignored if
+ * some previous part of write succeeded.
+ * Note the range [pos, pos+len-1] is entirely within the page.
+ */
+int _pnfs_write_begin(struct inode *inode, struct page *page,
+ loff_t pos, unsigned len,
+ struct pnfs_layout_segment *lseg,
+ struct pnfs_fsdata **fsdata)
+{
+ struct pnfs_fsdata *data;
+ int status = 0;
+
+ dprintk("--> %s: pos=%llu len=%u\n",
+ __func__, (unsigned long long)pos, len);
+ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
+ if (!data) {
+ status = -ENOMEM;
+ goto out;
+ }
+ data->lseg = lseg; /* refcount passed into data to be managed there */
+ status = NFS_SERVER(inode)->pnfs_curr_ld->write_begin(
+ lseg, page, pos, len, data);
+ if (status) {
+ kfree(data);
+ data = NULL;
+ }
+out:
+ *fsdata = data;
+ dprintk("<-- %s: status=%d\n", __func__, status);
+ return status;
+}
+
+/* pNFS Commit callback function for all layout drivers */
+void
+pnfs_commit_done(struct nfs_write_data *data)
+{
+ struct pnfs_call_data *pdata = &data->pdata;
+
+ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
+
+ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_RW,
+ .offset = data->args.offset,
+ .length = data->args.count,
+ };
+ dprintk("%s: retrying\n", __func__);
+ _pnfs_return_layout(data->inode, &range, true);
+ pnfs_initiate_commit(data, NFS_CLIENT(data->inode),
+ pdata->call_ops, pdata->how, 1);
+ }
+}
+EXPORT_SYMBOL_GPL(pnfs_commit_done);
+
+enum pnfs_try_status
+pnfs_try_to_commit(struct nfs_write_data *data,
+ const struct rpc_call_ops *call_ops, int sync)
+{
+ struct inode *inode = data->inode;
+ struct nfs_server *nfss = NFS_SERVER(data->inode);
+ enum pnfs_try_status trypnfs;
+
+ dprintk("%s: Begin\n", __func__);
+
+ if (!pnfs_use_rpc(nfss))
+ data->pdata.pnfsflags |= PNFS_NO_RPC;
+ /* We need to account for possibility that
+ * each nfs_page can point to a different lseg (or be NULL).
+ * For the immediate case of whole-file-only layouts, we at
+ * least know there can be only a single lseg.
+ * We still have to account for the possibility of some being NULL.
+ * This will be done by passing the buck to the layout driver.
+ */
+ data->pdata.call_ops = call_ops;
+ data->pdata.pnfs_error = 0;
+ data->pdata.how = sync;
+ data->pdata.lseg = NULL;
+ trypnfs = nfss->pnfs_curr_ld->commit(data, sync);
+ if (trypnfs == PNFS_NOT_ATTEMPTED) {
+ data->pdata.pnfsflags &= ~PNFS_NO_RPC;
+ _pnfs_clear_lseg_from_pages(&data->pages);
+ } else
+ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT);
+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+ return trypnfs;
+}
+
+void pnfs_cleanup_layoutcommit(struct inode *inode,
+ struct nfs4_layoutcommit_data *data)
+{
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ /* TODO: Maybe we should avoid this by allowing the layout driver
+ * to directly xdr its layout on the wire.
+ */
+ if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
+ nfss->pnfs_curr_ld->cleanup_layoutcommit(
+ NFS_I(inode)->layout, data);
+}
+
+/*
+ * Set up the argument/result storage required for the RPC call.
+ */
+static int
+pnfs_setup_layoutcommit(struct inode *inode,
+ struct nfs4_layoutcommit_data *data,
+ loff_t write_begin_pos, loff_t write_end_pos)
+{
+ struct nfs_server *nfss = NFS_SERVER(inode);
+ int result = 0;
+
+ dprintk("--> %s\n", __func__);
+
+ data->args.inode = inode;
+ data->args.fh = NFS_FH(inode);
+ data->args.layout_type = nfss->pnfs_curr_ld->id;
+ data->res.fattr = &data->fattr;
+ nfs_fattr_init(&data->fattr);
+
+ /* TODO: Need to determine the correct values */
+ data->args.time_modify_changed = 0;
+
+ /* Set values from inode so it can be reset
+ */
+ data->args.range.iomode = IOMODE_RW;
+ data->args.range.offset = write_begin_pos;
+ data->args.range.length = write_end_pos - write_begin_pos + 1;
+ data->args.lastbytewritten = min(write_end_pos,
+ i_size_read(inode) - 1);
+ data->args.bitmask = nfss->attr_bitmask;
+ data->res.server = nfss;
+
+ /* Call layout driver to set the arguments */
+ if (nfss->pnfs_curr_ld->setup_layoutcommit)
+ result = nfss->pnfs_curr_ld->setup_layoutcommit(
+ NFS_I(inode)->layout, &data->args);
+
+ dprintk("<-- %s Status %d\n", __func__, result);
+ return result;
+}
+
+/* Issue a async layoutcommit for an inode.
+ */
+int
+pnfs_layoutcommit_inode(struct inode *inode, int sync)
+{
+ struct nfs4_layoutcommit_data *data;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ loff_t write_begin_pos;
+ loff_t write_end_pos;
+
+ int status = 0;
+
+ dprintk("%s Begin (sync:%d)\n", __func__, sync);
+
+ BUG_ON(!has_layout(nfsi));
+
+ data = kzalloc(sizeof(*data), GFP_NOFS);
+ if (!data)
+ return -ENOMEM;
+
+ spin_lock(&inode->i_lock);
+ if (!layoutcommit_needed(nfsi)) {
+ spin_unlock(&inode->i_lock);
+ goto out_free;
+ }
+
+ /* Clear layoutcommit properties in the inode so
+ * new lc info can be generated
+ */
+ write_begin_pos = nfsi->layout->write_begin_pos;
+ write_end_pos = nfsi->layout->write_end_pos;
+ data->cred = nfsi->layout->cred;
+ nfsi->layout->write_begin_pos = 0;
+ nfsi->layout->write_end_pos = 0;
+ nfsi->layout->cred = NULL;
+ __clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags);
+ memcpy(data->args.stateid.data, nfsi->layout->plh_stateid.data,
+ NFS4_STATEID_SIZE);
+
+ /* Reference for layoutcommit matched in pnfs_layoutcommit_release */
+ get_layout_hdr(NFS_I(inode)->layout);
+
+ spin_unlock(&inode->i_lock);
+
+ /* Set up layout commit args */
+ status = pnfs_setup_layoutcommit(inode, data, write_begin_pos,
+ write_end_pos);
+ if (status) {
+ /* The layout driver failed to setup the layoutcommit */
+ put_rpccred(data->cred);
+ put_layout_hdr(NFS_I(inode)->layout);
+ goto out_free;
+ }
+ status = nfs4_proc_layoutcommit(data, sync);
+out:
+ dprintk("%s end (err:%d)\n", __func__, status);
+ return status;
+out_free:
+ kfree(data);
+ goto out;
+}
+
+void pnfs_free_fsdata(struct pnfs_fsdata *fsdata)
+{
+ /* lseg refcounting handled directly in nfs_write_end */
+ kfree(fsdata);
+}
+
/*
* Device ID cache. Currently supports one layout type per struct nfs_client.
* Add layout type to the lookup key to expand to support multiple types.
@@ -861,6 +1671,25 @@ pnfs_alloc_init_deviceid_cache(struct nf
}
EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
+/* Unhash @id's node. Caller must hold c->dc_lock; returns the removed node, or NULL if @id is not in the cache. */
+static struct pnfs_deviceid_node *
+pnfs_unhash_deviceid(struct pnfs_deviceid_cache *c,
+		     struct nfs4_deviceid *id)
+{
+	struct pnfs_deviceid_node *d;
+	struct hlist_node *n;
+	long h = nfs4_deviceid_hash(id);	/* bucket index for @id */
+
+	dprintk("%s hash %ld\n", __func__, h);
+	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
+		if (!memcmp(&d->de_id, id, sizeof(*id))) {
+			hlist_del_rcu(&d->de_node);	/* RCU readers may still see d until a grace period elapses */
+			return d;
+		}
+
+	return NULL;
+}
+
/*
* Called from pnfs_layoutdriver_type->free_lseg
* last layout segment reference frees deviceid
@@ -869,29 +1698,33 @@ void
pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
struct pnfs_deviceid_node *devid)
{
- struct nfs4_deviceid *id = &devid->de_id;
- struct pnfs_deviceid_node *d;
- struct hlist_node *n;
- long h = nfs4_deviceid_hash(id);
-
dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
return;
- hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
- if (!memcmp(&d->de_id, id, sizeof(*id))) {
- hlist_del_rcu(&d->de_node);
- spin_unlock(&c->dc_lock);
- synchronize_rcu();
- c->dc_free_callback(devid);
- return;
- }
+ pnfs_unhash_deviceid(c, &devid->de_id);
spin_unlock(&c->dc_lock);
- /* Why wasn't it found in the list? */
- BUG();
+ synchronize_rcu();
+ c->dc_free_callback(devid);
}
EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
+void /* Forcibly remove @id from the cache and drop the cache's reference to it */
+pnfs_delete_deviceid(struct pnfs_deviceid_cache *c,
+		     struct nfs4_deviceid *id)
+{
+	struct pnfs_deviceid_node *devid;
+
+	spin_lock(&c->dc_lock);
+	devid = pnfs_unhash_deviceid(c, id);
+	spin_unlock(&c->dc_lock);
+	synchronize_rcu();	/* wait out RCU readers of the hash chain before the final put */
+	dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));	/* NOTE(review): devid is dereferenced unconditionally, but pnfs_unhash_deviceid() returns NULL when @id is absent — confirm callers only pass cached ids */
+	if (atomic_dec_and_test(&devid->de_ref))
+		c->dc_free_callback(devid);	/* last reference dropped: free via the layout driver's callback */
+}
+EXPORT_SYMBOL_GPL(pnfs_delete_deviceid);
+
/* Find and reference a deviceid */
struct pnfs_deviceid_node *
pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
diff -up linux-2.6.38.noarch/fs/nfs/pnfs.h.orig linux-2.6.38.noarch/fs/nfs/pnfs.h
--- linux-2.6.38.noarch/fs/nfs/pnfs.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/pnfs.h 2011-03-26 07:57:44.272821329 -0400
@@ -30,6 +30,9 @@
#ifndef FS_NFS_PNFS_H
#define FS_NFS_PNFS_H
+#include <linux/nfs_page.h>
+#include "callback.h"
+
enum {
NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
NFS_LSEG_ROC, /* roc bit received from server */
@@ -43,6 +46,17 @@ struct pnfs_layout_segment {
struct pnfs_layout_hdr *pls_layout;
};
+enum pnfs_try_status {
+ PNFS_ATTEMPTED = 0,
+ PNFS_NOT_ATTEMPTED = 1,
+};
+
+struct pnfs_fsdata {
+ struct pnfs_layout_segment *lseg;
+ int bypass_eof;
+ void *private;
+};
+
#ifdef CONFIG_NFS_V4_1
#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -51,20 +65,87 @@ enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
+ NFS_LAYOUT_NEED_LCOMMIT, /* LAYOUTCOMMIT needed */
NFS_LAYOUT_ROC, /* some lseg had roc bit set */
NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
};
+enum layoutdriver_policy_flags {
+ /* Should the full nfs rpc cleanup code be used after io */
+ PNFS_USE_RPC_CODE = 1 << 0,
+
+ /* Should the pNFS client commit and return the layout upon a setattr */
+ PNFS_LAYOUTRET_ON_SETATTR = 1 << 1,
+};
+
/* Per-layout driver specific registration structure */
struct pnfs_layoutdriver_type {
struct list_head pnfs_tblid;
const u32 id;
const char *name;
struct module *owner;
- int (*set_layoutdriver) (struct nfs_server *);
+ unsigned flags;
+ int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
int (*clear_layoutdriver) (struct nfs_server *);
+
+ struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode);
+ void (*free_layout_hdr) (struct pnfs_layout_hdr *);
+
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
void (*free_lseg) (struct pnfs_layout_segment *lseg);
+
+ /* test for nfs page cache coalescing */
+ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+
+	/* Retrieve the block size of the file system.
+ * If gather_across_stripes == 1, then the file system will gather
+ * requests into the block size.
+ * TODO: Where will the layout driver get this info? It is hard
+ * coded in PVFS2.
+ */
+ ssize_t (*get_blocksize) (void);
+
+/* read_pagelist and write_pagelist should return PNFS_ATTEMPTED (0)
+ * to indicate that the layout code has taken control, or
+ * PNFS_NOT_ATTEMPTED (1) to fall back to normal nfs. If
+ * PNFS_ATTEMPTED is returned, information can be passed back through
+ * nfs_data->res and nfs_data->task.tk_status, and the appropriate
+ * pnfs done function MUST be called.
+ */
+ enum pnfs_try_status
+ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages);
+ enum pnfs_try_status
+ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how);
+ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page,
+ loff_t pos, unsigned count,
+ struct pnfs_fsdata *fsdata);
+ int (*write_end)(struct inode *inode, struct page *page, loff_t pos,
+ unsigned count, unsigned copied,
+ struct pnfs_layout_segment *lseg);
+ void (*write_end_cleanup)(struct file *filp,
+ struct pnfs_fsdata *fsdata);
+
+ /* Consistency ops */
+ /* 2 problems:
+ * 1) the page list contains nfs_pages, NOT pages
+ * 2) currently the NFS code doesn't create a page array (as it does with read/write)
+ */
+ enum pnfs_try_status
+ (*commit) (struct nfs_write_data *nfs_data, int how);
+
+ int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+ struct nfs4_layoutcommit_args *args);
+
+ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *args);
+
+ void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+ struct nfs4_layoutcommit_data *data);
+
+ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args);
};
struct pnfs_layout_hdr {
@@ -72,11 +153,18 @@ struct pnfs_layout_hdr {
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
struct list_head plh_segs; /* layout segments list */
+ int roc_iomode;/* return on close iomode, 0=none */
nfs4_stateid plh_stateid;
atomic_t plh_outstanding; /* number of RPCs out */
unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
u32 plh_barrier; /* ignore lower seqids */
unsigned long plh_flags;
+ struct rpc_cred *cred; /* layoutcommit credential */
+ /* DH: These vars keep track of the maximum write range
+ * so the values can be used for layoutcommit.
+ */
+ loff_t write_begin_pos;
+ loff_t write_end_pos;
struct inode *plh_inode;
};
@@ -90,6 +178,14 @@ struct pnfs_device {
unsigned int pglen;
};
+#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
+
+struct pnfs_devicelist {
+ unsigned int eof;
+ unsigned int num_devs;
+ struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
+};
+
/*
* Device ID RCU cache. A device ID is unique per client ID and layout type.
*/
@@ -135,22 +231,52 @@ extern struct pnfs_deviceid_node *pnfs_a
struct pnfs_deviceid_node *);
extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
struct pnfs_deviceid_node *devid);
+extern void pnfs_delete_deviceid(struct pnfs_deviceid_cache *,
+ struct nfs4_deviceid *);
extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
/* nfs4proc.c */
+extern int nfs4_proc_getdevicelist(struct nfs_server *server,
+ const struct nfs_fh *fh,
+ struct pnfs_devicelist *devlist);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev);
extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
+ int issync);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
/* pnfs.c */
void get_layout_hdr(struct pnfs_layout_hdr *lo);
+void put_lseg(struct pnfs_layout_segment *lseg);
+bool should_free_lseg(struct pnfs_layout_range *lseg_range,
+ struct pnfs_layout_range *recall_range);
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
- enum pnfs_iomode access_type);
-void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
+ loff_t pos, u64 count, enum pnfs_iomode access_type);
+int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait);
+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id);
void unset_pnfs_layoutdriver(struct nfs_server *);
+enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
+ const struct rpc_call_ops *, int);
+enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
+ const struct rpc_call_ops *);
+void pnfs_cleanup_layoutcommit(struct inode *,
+ struct nfs4_layoutcommit_data *);
+int pnfs_layoutcommit_inode(struct inode *inode, int sync);
+void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent);
+void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx);
+void pnfs_set_ds_iosize(struct nfs_server *server);
+enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *,
+ const struct rpc_call_ops *, int);
+void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
+ struct nfs_open_context *, struct list_head *,
+ size_t *);
+void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *,
+ size_t *);
+void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
@@ -162,14 +288,26 @@ void pnfs_set_layout_stateid(struct pnfs
int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct pnfs_layout_hdr *lo,
struct nfs4_state *open_state);
+void pnfs_read_done(struct nfs_read_data *);
+void pnfs_writeback_done(struct nfs_write_data *);
+void pnfs_commit_done(struct nfs_write_data *);
+int _pnfs_write_begin(struct inode *inode, struct page *page,
+ loff_t pos, unsigned len,
+ struct pnfs_layout_segment *lseg,
+ struct pnfs_fsdata **fsdata);
int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- u32 iomode);
+ struct pnfs_layout_range *recall_range);
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
+static inline bool
+has_layout(struct nfs_inode *nfsi)
+{
+ return nfsi->layout != NULL;
+}
static inline int lo_fail_bit(u32 iomode)
{
@@ -177,12 +315,141 @@ static inline int lo_fail_bit(u32 iomode
NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
}
+static inline struct pnfs_layout_segment *
+get_lseg(struct pnfs_layout_segment *lseg)
+{
+ if (lseg) {
+ atomic_inc(&lseg->pls_refcount);
+ smp_mb__after_atomic_inc();
+ }
+ return lseg;
+}
+
/* Return true if a layout driver is being used for this mountpoint */
static inline int pnfs_enabled_sb(struct nfs_server *nfss)
{
return nfss->pnfs_curr_ld != NULL;
}
+static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, /* nonzero when nfs_grow_file() may run for this write */
+			       struct pnfs_fsdata *fsdata)
+{
+	return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) || /* fsdata holds the bare lseg pointer when the layout driver has no write_begin (see pnfs_write_begin) */
+		!fsdata->bypass_eof;
+}
+
+/* Should the pNFS client commit and return the layout upon a setattr */
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+ if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+ return false;
+ return NFS_SERVER(inode)->pnfs_curr_ld->flags &
+ PNFS_LAYOUTRET_ON_SETATTR;
+}
+
+static inline bool pnfs_use_rpc(struct nfs_server *nfss)
+{
+ if (pnfs_enabled_sb(nfss))
+ return nfss->pnfs_curr_ld->flags & PNFS_USE_RPC_CODE;
+
+ return true;
+}
+
+/* Should the pNFS client commit and return the layout on close
+ */
+static inline int
+pnfs_layout_roc_iomode(struct nfs_inode *nfsi)
+{
+ return nfsi->layout->roc_iomode;
+}
+
+static inline int pnfs_write_begin(struct file *filp, struct page *page,
+ loff_t pos, unsigned len,
+ struct pnfs_layout_segment *lseg,
+ void **fsdata)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+ int status = 0;
+
+ *fsdata = lseg;
+ if (lseg && nfss->pnfs_curr_ld->write_begin)
+ status = _pnfs_write_begin(inode, page, pos, len, lseg,
+ (struct pnfs_fsdata **) fsdata);
+ return status;
+}
+
+/* NOTE(review): the short-copy case (copied < len) is not handled here; it is passed straight through to the layout driver's write_end — verify drivers cope */
+static inline int pnfs_write_end(struct file *filp, struct page *page,
+ loff_t pos, unsigned len, unsigned copied,
+ struct pnfs_layout_segment *lseg)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_end)
+ return nfss->pnfs_curr_ld->write_end(inode, page, pos, len,
+ copied, lseg);
+ else
+ return 0;
+}
+
+static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
+{
+ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
+
+ if (fsdata && nfss->pnfs_curr_ld) {
+ if (nfss->pnfs_curr_ld->write_end_cleanup)
+ nfss->pnfs_curr_ld->write_end_cleanup(filp, fsdata);
+ if (nfss->pnfs_curr_ld->write_begin)
+ pnfs_free_fsdata(fsdata);
+ }
+}
+
+static inline int pnfs_return_layout(struct inode *ino,
+ struct pnfs_layout_range *range,
+ bool wait)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct nfs_server *nfss = NFS_SERVER(ino);
+
+ if (pnfs_enabled_sb(nfss) && has_layout(nfsi))
+ return _pnfs_return_layout(ino, range, wait);
+
+ return 0;
+}
+
+static inline bool
+layoutcommit_needed(struct nfs_inode *nfsi)
+{
+ return has_layout(nfsi) &&
+ test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags);
+}
+
+static inline int pnfs_get_write_status(struct nfs_write_data *data)
+{
+ return data->pdata.pnfs_error;
+}
+
+static inline int pnfs_get_read_status(struct nfs_read_data *data)
+{
+ return data->pdata.pnfs_error;
+}
+
+static inline struct pnfs_layout_segment *
+nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
+{
+ if (fsdata) {
+ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
+
+ if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_begin)
+ return ((struct pnfs_fsdata *) fsdata)->lseg;
+ return (struct pnfs_layout_segment *)fsdata;
+ }
+ return NULL;
+}
+
#else /* CONFIG_NFS_V4_1 */
static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -194,18 +461,66 @@ static inline void pnfs_destroy_layout(s
}
static inline struct pnfs_layout_segment *
+get_lseg(struct pnfs_layout_segment *lseg)
+{
+ return NULL;
+}
+
+static inline void put_lseg(struct pnfs_layout_segment *lseg)
+{
+}
+
+static inline struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
- enum pnfs_iomode access_type)
+ loff_t pos, u64 count, enum pnfs_iomode access_type)
{
return NULL;
}
static inline bool
-pnfs_roc(struct inode *ino)
+has_layout(struct nfs_inode *nfsi)
{
return false;
}
+static inline bool
+layoutcommit_needed(struct nfs_inode *nfsi)
+{
+ return 0;
+}
+
+static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
+ struct pnfs_fsdata *fsdata)
+{
+ return 1;
+}
+
+static inline enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_read_data *data,
+ const struct rpc_call_ops *call_ops)
+{
+ return PNFS_NOT_ATTEMPTED;
+}
+
+static inline enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_write_data *data,
+ const struct rpc_call_ops *call_ops, int how)
+{
+ return PNFS_NOT_ATTEMPTED;
+}
+
+static inline enum pnfs_try_status
+pnfs_try_to_commit(struct nfs_write_data *data,
+ const struct rpc_call_ops *call_ops, int how)
+{
+ return PNFS_NOT_ATTEMPTED;
+}
+
+static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync)
+{
+ return 0;
+}
+
static inline void
pnfs_roc_release(struct inode *ino)
{
@@ -222,7 +537,37 @@ pnfs_roc_drain(struct inode *ino, u32 *b
return false;
}
-static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
+static inline bool
+pnfs_roc(struct inode *ino)
+{
+ return false;
+}
+
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+ return false;
+}
+
+static inline bool pnfs_use_rpc(struct nfs_server *nfss)
+{
+ return true;
+}
+
+static inline int
+pnfs_layout_roc_iomode(struct nfs_inode *nfsi)
+{
+ return 0;
+}
+
+static inline int pnfs_return_layout(struct inode *ino,
+ struct pnfs_layout_range *range,
+ bool wait)
+{
+ return 0;
+}
+
+static inline void set_pnfs_layoutdriver(struct nfs_server *s, const struct nfs_fh *mntfh, u32 id)
{
}
@@ -230,6 +575,62 @@ static inline void unset_pnfs_layoutdriv
{
}
+static inline void pnfs_set_ds_iosize(struct nfs_server *server)
+{
+ server->ds_wsize = server->ds_rsize = -1;
+}
+
+static inline int pnfs_write_begin(struct file *filp, struct page *page,
+ loff_t pos, unsigned len,
+ struct pnfs_layout_segment *lseg,
+ void **fsdata)
+{
+ *fsdata = NULL;
+ return 0;
+}
+
+static inline int pnfs_write_end(struct file *filp, struct page *page,
+ loff_t pos, unsigned len, unsigned copied,
+ struct pnfs_layout_segment *lseg)
+{
+ return 0;
+}
+
+static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
+{
+}
+
+static inline int pnfs_get_write_status(struct nfs_write_data *data)
+{
+ return 0;
+}
+
+static inline int pnfs_get_read_status(struct nfs_read_data *data)
+{
+ return 0;
+}
+
+static inline void
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino,
+ struct nfs_open_context *ctx, struct list_head *pages,
+ size_t *rsize)
+{
+ pgio->pg_lseg = NULL;
+}
+
+static inline void
+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino,
+ size_t *wsize)
+{
+ pgio->pg_lseg = NULL;
+}
+
+static inline struct pnfs_layout_segment *
+nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
+{
+ return NULL;
+}
+
#endif /* CONFIG_NFS_V4_1 */
#endif /* FS_NFS_PNFS_H */
diff -up linux-2.6.38.noarch/fs/nfs/read.c.orig linux-2.6.38.noarch/fs/nfs/read.c
--- linux-2.6.38.noarch/fs/nfs/read.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/read.c 2011-03-26 07:57:44.273821320 -0400
@@ -18,14 +18,17 @@
#include <linux/sunrpc/clnt.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
#include <asm/system.h>
+#include <linux/module.h>
+#include "pnfs.h"
#include "nfs4_fs.h"
#include "internal.h"
#include "iostat.h"
#include "fscache.h"
-#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
@@ -117,12 +120,16 @@ int nfs_readpage_async(struct nfs_open_c
LIST_HEAD(one_request);
struct nfs_page *new;
unsigned int len;
+ loff_t pgoffs;
+ struct pnfs_layout_segment *lseg;
len = nfs_page_length(page);
if (len == 0)
return nfs_return_empty_page(page);
- pnfs_update_layout(inode, ctx, IOMODE_READ);
- new = nfs_create_request(ctx, inode, page, 0, len);
+ pgoffs = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ lseg = pnfs_update_layout(inode, ctx, pgoffs, len, IOMODE_READ);
+ new = nfs_create_request(ctx, inode, page, 0, len, lseg);
+ put_lseg(lseg);
if (IS_ERR(new)) {
unlock_page(page);
return PTR_ERR(new);
@@ -155,24 +162,20 @@ static void nfs_readpage_release(struct
nfs_release_request(req);
}
-/*
- * Set up the NFS read request struct
- */
-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
- const struct rpc_call_ops *call_ops,
- unsigned int count, unsigned int offset)
+int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops)
{
- struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct inode *inode = data->inode;
int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
struct rpc_task *task;
struct rpc_message msg = {
.rpc_argp = &data->args,
.rpc_resp = &data->res,
- .rpc_cred = req->wb_context->cred,
+ .rpc_cred = data->cred,
};
struct rpc_task_setup task_setup_data = {
.task = &data->task,
- .rpc_client = NFS_CLIENT(inode),
+ .rpc_client = clnt,
.rpc_message = &msg,
.callback_ops = call_ops,
.callback_data = data,
@@ -180,9 +183,46 @@ static int nfs_read_rpcsetup(struct nfs_
.flags = RPC_TASK_ASYNC | swap_flags,
};
+ /* Set up the initial task struct. */
+ NFS_PROTO(inode)->read_setup(data, &msg);
+
+ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 0;
+}
+EXPORT_SYMBOL(nfs_initiate_read);
+
+int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops)
+{
+ if (data->req->wb_lseg &&
+ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
+ return pnfs_get_read_status(data);
+
+ return nfs_initiate_read(data, clnt, call_ops);
+}
+
+/*
+ * Set up the NFS read request struct
+ */
+static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+ const struct rpc_call_ops *call_ops,
+ unsigned int count, unsigned int offset)
+{
+ struct inode *inode = req->wb_context->path.dentry->d_inode;
+
data->req = req;
data->inode = inode;
- data->cred = msg.rpc_cred;
+ data->cred = req->wb_context->cred;
data->args.fh = NFS_FH(inode);
data->args.offset = req_offset(req) + offset;
@@ -197,21 +237,7 @@ static int nfs_read_rpcsetup(struct nfs_
data->res.eof = 0;
nfs_fattr_init(&data->fattr);
- /* Set up the initial task struct. */
- NFS_PROTO(inode)->read_setup(data, &msg);
-
- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- count,
- (unsigned long long)data->args.offset);
-
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task))
- return PTR_ERR(task);
- rpc_put_task(task);
- return 0;
+ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
}
static void
@@ -355,7 +381,14 @@ static void nfs_readpage_retry(struct rp
{
struct nfs_readargs *argp = &data->args;
struct nfs_readres *resp = &data->res;
+ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client;
+#ifdef CONFIG_NFS_V4_1
+ if (data->fldata.ds_nfs_client) {
+ dprintk("%s DS read\n", __func__);
+ clp = data->fldata.ds_nfs_client;
+ }
+#endif /* CONFIG_NFS_V4_1 */
if (resp->eof || resp->count == argp->count)
return;
@@ -369,7 +402,10 @@ static void nfs_readpage_retry(struct rp
argp->offset += resp->count;
argp->pgbase += resp->count;
argp->count -= resp->count;
- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
+#ifdef CONFIG_NFS_V4_1
+ data->pdata.pnfs_error = -EAGAIN;
+#endif /* CONFIG_NFS_V4_1 */
+ nfs_restart_rpc(task, clp);
}
/*
@@ -410,13 +446,19 @@ static void nfs_readpage_release_partial
void nfs_read_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_read_data *data = calldata;
+ struct nfs4_session *ds_session = NULL;
- if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+ if (data->fldata.ds_nfs_client) {
+ dprintk("%s DS read\n", __func__);
+ ds_session = data->fldata.ds_nfs_client->cl_session;
+ }
+ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session,
&data->args.seq_args, &data->res.seq_res,
0, task))
return;
rpc_call_start(task);
}
+EXPORT_SYMBOL(nfs_read_prepare);
#endif /* CONFIG_NFS_V4_1 */
static const struct rpc_call_ops nfs_read_partial_ops = {
@@ -569,7 +611,20 @@ readpage_async_filler(void *data, struct
if (len == 0)
return nfs_return_empty_page(page);
- new = nfs_create_request(desc->ctx, inode, page, 0, len);
+ if (desc->pgio->pg_lseg) {
+ loff_t pgoff = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ struct pnfs_layout_range *range = &desc->pgio->pg_lseg->pls_range;
+
+ /* retry later with the right lseg? */
+ if (range->offset > pgoff + len ||
+ range->offset + range->length < pgoff) {
+ new = ERR_PTR(-EAGAIN);
+ goto out_error;
+ }
+ }
+
+ new = nfs_create_request(desc->ctx, inode, page, 0, len,
+ desc->pgio->pg_lseg);
if (IS_ERR(new))
goto out_error;
@@ -625,7 +680,7 @@ int nfs_readpages(struct file *filp, str
if (ret == 0)
goto read_complete; /* all pages were read */
- pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
+ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize);
if (rsize < PAGE_CACHE_SIZE)
nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
else
@@ -634,6 +689,7 @@ int nfs_readpages(struct file *filp, str
ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
nfs_pageio_complete(&pgio);
+ put_lseg(pgio.pg_lseg);
npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
nfs_add_stats(inode, NFSIOS_READPAGES, npages);
read_complete:
diff -up linux-2.6.38.noarch/fs/nfs/super.c.orig linux-2.6.38.noarch/fs/nfs/super.c
--- linux-2.6.38.noarch/fs/nfs/super.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/super.c 2011-03-26 07:57:44.275821302 -0400
@@ -63,6 +63,7 @@
#include "iostat.h"
#include "internal.h"
#include "fscache.h"
+#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_VFS
@@ -725,6 +726,28 @@ static int nfs_show_options(struct seq_f
return 0;
}
+#ifdef CONFIG_NFS_V4_1
+void show_sessions(struct seq_file *m, struct nfs_server *server)
+{
+ if (nfs4_has_session(server->nfs_client))
+ seq_printf(m, ",sessions");
+}
+#else
+void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+#endif
+
+#ifdef CONFIG_NFS_V4_1
+void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+ seq_printf(m, ",pnfs=");
+ if (server->pnfs_curr_ld)
+ seq_printf(m, "%s", server->pnfs_curr_ld->name);
+ else
+ seq_printf(m, "not configured");
+}
+#else /* CONFIG_NFS_V4_1 */
+void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#endif /* CONFIG_NFS_V4_1 */
/*
* Present statistical information for this VFS mountpoint
@@ -763,6 +786,8 @@ static int nfs_show_stats(struct seq_fil
seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+ show_sessions(m, nfss);
+ show_pnfs(m, nfss);
}
#endif
diff -up linux-2.6.38.noarch/fs/nfs/unlink.c.orig linux-2.6.38.noarch/fs/nfs/unlink.c
--- linux-2.6.38.noarch/fs/nfs/unlink.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/unlink.c 2011-03-26 07:57:44.276821293 -0400
@@ -113,7 +113,7 @@ void nfs_unlink_prepare(struct rpc_task
struct nfs_unlinkdata *data = calldata;
struct nfs_server *server = NFS_SERVER(data->dir);
- if (nfs4_setup_sequence(server, &data->args.seq_args,
+ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args,
&data->res.seq_res, 1, task))
return;
rpc_call_start(task);
@@ -184,19 +184,17 @@ static int nfs_do_call_unlink(struct den
return 1;
}
-static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
+static int nfs_call_unlink(struct dentry *parent, struct dentry *dentry, struct nfs_unlinkdata *data)
{
- struct dentry *parent;
struct inode *dir;
int ret = 0;
- parent = dget_parent(dentry);
if (parent == NULL)
- goto out_free;
+ goto out;
dir = parent->d_inode;
if (nfs_copy_dname(dentry, data) != 0)
- goto out_dput;
+ goto out;
/* Non-exclusive lock protects against concurrent lookup() calls */
spin_lock(&dir->i_lock);
if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) {
@@ -204,13 +202,11 @@ static int nfs_call_unlink(struct dentry
hlist_add_head(&data->list, &NFS_I(dir)->silly_list);
spin_unlock(&dir->i_lock);
ret = 1;
- goto out_dput;
+ goto out;
}
spin_unlock(&dir->i_lock);
ret = nfs_do_call_unlink(parent, dir, data);
-out_dput:
- dput(parent);
-out_free:
+out:
return ret;
}
@@ -283,26 +279,24 @@ out:
/**
* nfs_complete_unlink - Initialize completion of the sillydelete
+ * @parent: parent directory
* @dentry: dentry to delete
- * @inode: inode
*
* Since we're most likely to be called by dentry_iput(), we
* only use the dentry to find the sillydelete. We then copy the name
* into the qstr.
*/
void
-nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
+nfs_complete_unlink(struct dentry *parent, struct dentry *dentry)
{
struct nfs_unlinkdata *data = NULL;
- spin_lock(&dentry->d_lock);
if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
data = dentry->d_fsdata;
}
- spin_unlock(&dentry->d_lock);
- if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
+ if (data != NULL && !nfs_call_unlink(parent, dentry, data))
nfs_free_unlinkdata(data);
}
@@ -388,7 +382,7 @@ static void nfs_rename_prepare(struct rp
struct nfs_renamedata *data = calldata;
struct nfs_server *server = NFS_SERVER(data->old_dir);
- if (nfs4_setup_sequence(server, &data->args.seq_args,
+ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args,
&data->res.seq_res, 1, task))
return;
rpc_call_start(task);
diff -up linux-2.6.38.noarch/fs/nfs/write.c.orig linux-2.6.38.noarch/fs/nfs/write.c
--- linux-2.6.38.noarch/fs/nfs/write.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/fs/nfs/write.c 2011-03-26 07:57:44.277821284 -0400
@@ -28,6 +28,7 @@
#include "iostat.h"
#include "nfs4_fs.h"
#include "fscache.h"
+#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
@@ -58,6 +59,7 @@ struct nfs_write_data *nfs_commitdata_al
}
return p;
}
+EXPORT_SYMBOL(nfs_commitdata_alloc);
void nfs_commit_free(struct nfs_write_data *p)
{
@@ -426,6 +428,17 @@ static void nfs_inode_remove_request(str
spin_unlock(&inode->i_lock);
nfs_release_request(req);
}
+static void
+nfs_mark_request_nopnfs(struct nfs_page *req)
+{
+ struct pnfs_layout_segment *lseg = req->wb_lseg;
+
+ if (req->wb_lseg == NULL)
+ return;
+ req->wb_lseg = NULL;
+ put_lseg(lseg);
+ dprintk(" retry through MDS\n");
+}
static void
nfs_mark_request_dirty(struct nfs_page *req)
@@ -531,7 +544,7 @@ nfs_need_commit(struct nfs_inode *nfsi)
* The requests are *not* checked to ensure that they form a contiguous set.
*/
static int
-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
+nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs)
{
struct nfs_inode *nfsi = NFS_I(inode);
int ret;
@@ -539,7 +552,8 @@ nfs_scan_commit(struct inode *inode, str
if (!nfs_need_commit(nfsi))
return 0;
- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
+ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT,
+ use_pnfs);
if (ret > 0)
nfsi->ncommit -= ret;
if (nfs_need_commit(NFS_I(inode)))
@@ -568,7 +582,8 @@ static inline int nfs_scan_commit(struct
static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
struct page *page,
unsigned int offset,
- unsigned int bytes)
+ unsigned int bytes,
+ struct pnfs_layout_segment *lseg)
{
struct nfs_page *req;
unsigned int rqend;
@@ -593,8 +608,8 @@ static struct nfs_page *nfs_try_to_updat
* Note: nfs_flush_incompatible() will already
* have flushed out requests having wrong owners.
*/
- if (offset > rqend
- || end < req->wb_offset)
+ if (offset > rqend || end < req->wb_offset ||
+ req->wb_lseg != lseg)
goto out_flushme;
if (nfs_set_page_tag_locked(req))
@@ -642,16 +657,17 @@ out_err:
* already called nfs_flush_incompatible() if necessary.
*/
static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
- struct page *page, unsigned int offset, unsigned int bytes)
+ struct page *page, unsigned int offset, unsigned int bytes,
+ struct pnfs_layout_segment *lseg)
{
struct inode *inode = page->mapping->host;
struct nfs_page *req;
int error;
- req = nfs_try_to_update_request(inode, page, offset, bytes);
+ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg);
if (req != NULL)
goto out;
- req = nfs_create_request(ctx, inode, page, offset, bytes);
+ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg);
if (IS_ERR(req))
goto out;
error = nfs_inode_add_request(inode, req);
@@ -664,23 +680,27 @@ out:
}
static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
- unsigned int offset, unsigned int count)
+ unsigned int offset, unsigned int count,
+ struct pnfs_layout_segment *lseg,
+ void *fsdata)
{
struct nfs_page *req;
- req = nfs_setup_write_request(ctx, page, offset, count);
+ req = nfs_setup_write_request(ctx, page, offset, count, lseg);
if (IS_ERR(req))
return PTR_ERR(req);
nfs_mark_request_dirty(req);
/* Update file length */
- nfs_grow_file(page, offset, count);
+ if (pnfs_grow_ok(lseg, fsdata))
+ nfs_grow_file(page, offset, count);
nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
nfs_mark_request_dirty(req);
nfs_clear_page_tag_locked(req);
return 0;
}
-int nfs_flush_incompatible(struct file *file, struct page *page)
+int nfs_flush_incompatible(struct file *file, struct page *page,
+ struct pnfs_layout_segment *lseg)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct nfs_page *req;
@@ -699,7 +719,8 @@ int nfs_flush_incompatible(struct file *
return 0;
do_flush = req->wb_page != page || req->wb_context != ctx ||
req->wb_lock_context->lockowner != current->files ||
- req->wb_lock_context->pid != current->tgid;
+ req->wb_lock_context->pid != current->tgid ||
+ req->wb_lseg != lseg;
nfs_release_request(req);
if (!do_flush)
return 0;
@@ -726,7 +747,8 @@ static int nfs_write_pageuptodate(struct
* things with a page scheduled for an RPC call (e.g. invalidate it).
*/
int nfs_updatepage(struct file *file, struct page *page,
- unsigned int offset, unsigned int count)
+ unsigned int offset, unsigned int count,
+ struct pnfs_layout_segment *lseg, void *fsdata)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct inode *inode = page->mapping->host;
@@ -751,7 +773,7 @@ int nfs_updatepage(struct file *file, st
offset = 0;
}
- status = nfs_writepage_setup(ctx, page, offset, count);
+ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata);
if (status < 0)
nfs_set_pageerror(page);
@@ -781,25 +803,21 @@ static int flush_task_priority(int how)
return RPC_PRIORITY_NORMAL;
}
-/*
- * Set up the argument/result storage required for the RPC call.
- */
-static int nfs_write_rpcsetup(struct nfs_page *req,
- struct nfs_write_data *data,
- const struct rpc_call_ops *call_ops,
- unsigned int count, unsigned int offset,
- int how)
+int nfs_initiate_write(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how)
{
- struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct inode *inode = data->inode;
int priority = flush_task_priority(how);
struct rpc_task *task;
struct rpc_message msg = {
.rpc_argp = &data->args,
.rpc_resp = &data->res,
- .rpc_cred = req->wb_context->cred,
+ .rpc_cred = data->cred,
};
struct rpc_task_setup task_setup_data = {
- .rpc_client = NFS_CLIENT(inode),
+ .rpc_client = clnt,
.task = &data->task,
.rpc_message = &msg,
.callback_ops = call_ops,
@@ -810,12 +828,62 @@ static int nfs_write_rpcsetup(struct nfs
};
int ret = 0;
+ /* Set up the initial task struct. */
+ NFS_PROTO(inode)->write_setup(data, &msg);
+
+ dprintk("NFS: %5u initiated write call "
+ "(req %s/%lld, %u bytes @ offset %llu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task)) {
+ ret = PTR_ERR(task);
+ goto out;
+ }
+ if (how & FLUSH_SYNC) {
+ ret = rpc_wait_for_completion_task(task);
+ if (ret == 0)
+ ret = task->tk_status;
+ }
+ rpc_put_task(task);
+out:
+ return ret;
+}
+EXPORT_SYMBOL(nfs_initiate_write);
+
+int pnfs_initiate_write(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how)
+{
+ if (data->req->wb_lseg &&
+ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
+ return pnfs_get_write_status(data);
+
+ return nfs_initiate_write(data, clnt, call_ops, how);
+}
+
+/*
+ * Set up the argument/result storage required for the RPC call.
+ */
+static int nfs_write_rpcsetup(struct nfs_page *req,
+ struct nfs_write_data *data,
+ const struct rpc_call_ops *call_ops,
+ unsigned int count, unsigned int offset,
+ int how)
+{
+ struct inode *inode = req->wb_context->path.dentry->d_inode;
+
/* Set up the RPC argument and reply structs
* NB: take care not to mess about with data->commit et al. */
data->req = req;
data->inode = inode = req->wb_context->path.dentry->d_inode;
- data->cred = msg.rpc_cred;
+ data->cred = req->wb_context->cred;
data->args.fh = NFS_FH(inode);
data->args.offset = req_offset(req) + offset;
@@ -836,30 +904,7 @@ static int nfs_write_rpcsetup(struct nfs
data->res.verf = &data->verf;
nfs_fattr_init(&data->fattr);
- /* Set up the initial task struct. */
- NFS_PROTO(inode)->write_setup(data, &msg);
-
- dprintk("NFS: %5u initiated write call "
- "(req %s/%lld, %u bytes @ offset %llu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- count,
- (unsigned long long)data->args.offset);
-
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task)) {
- ret = PTR_ERR(task);
- goto out;
- }
- if (how & FLUSH_SYNC) {
- ret = rpc_wait_for_completion_task(task);
- if (ret == 0)
- ret = task->tk_status;
- }
- rpc_put_task(task);
-out:
- return ret;
+ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
}
/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -870,6 +915,7 @@ static void nfs_redirty_request(struct n
{
struct page *page = req->wb_page;
+ nfs_mark_request_nopnfs(req);
nfs_mark_request_dirty(req);
nfs_clear_page_tag_locked(req);
nfs_end_page_writeback(page);
@@ -982,6 +1028,8 @@ static void nfs_pageio_init_write(struct
{
size_t wsize = NFS_SERVER(inode)->wsize;
+ pnfs_pageio_init_write(pgio, inode, &wsize);
+
if (wsize < PAGE_CACHE_SIZE)
nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
else
@@ -1047,13 +1095,27 @@ out:
void nfs_write_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_write_data *data = calldata;
+ struct nfs4_session *ds_session = NULL;
- if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+ if (data->fldata.ds_nfs_client) {
+ dprintk("%s DS read\n", __func__);
+ ds_session = data->fldata.ds_nfs_client->cl_session;
+ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) {
+ /* retrying via MDS? */
+ data->pdata.orig_count = data->args.count;
+ data->args.count = NFS_SERVER(data->inode)->wsize;
+ dprintk("%s: trimmed count %u to wsize %u\n", __func__,
+ data->pdata.orig_count, data->args.count);
+ } else
+ data->pdata.orig_count = 0;
+
+ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session,
&data->args.seq_args,
&data->res.seq_res, 1, task))
return;
rpc_call_start(task);
}
+EXPORT_SYMBOL(nfs_write_prepare);
#endif /* CONFIG_NFS_V4_1 */
static const struct rpc_call_ops nfs_write_partial_ops = {
@@ -1137,10 +1199,11 @@ int nfs_writeback_done(struct rpc_task *
struct nfs_writeargs *argp = &data->args;
struct nfs_writeres *resp = &data->res;
struct nfs_server *server = NFS_SERVER(data->inode);
+ struct nfs_client *clp = server->nfs_client;
int status;
- dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
- task->tk_pid, task->tk_status);
+ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n",
+ task->tk_pid, task->tk_status, resp->count);
/*
* ->write_done will attempt to use post-op attributes to detect
@@ -1153,6 +1216,13 @@ int nfs_writeback_done(struct rpc_task *
if (status != 0)
return status;
nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
+#ifdef CONFIG_NFS_V4_1
+ /* Is this a DS session */
+ if (data->fldata.ds_nfs_client) {
+ dprintk("%s DS write\n", __func__);
+ clp = data->fldata.ds_nfs_client;
+ }
+#endif /* CONFIG_NFS_V4_1 */
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
@@ -1169,7 +1239,7 @@ int nfs_writeback_done(struct rpc_task *
if (time_before(complain, jiffies)) {
dprintk("NFS: faulty NFS server %s:"
" (committed = %d) != (stable = %d)\n",
- server->nfs_client->cl_hostname,
+ clp->cl_hostname,
resp->verf->committed, argp->stable);
complain = jiffies + 300 * HZ;
}
@@ -1195,6 +1265,9 @@ int nfs_writeback_done(struct rpc_task *
*/
argp->stable = NFS_FILE_SYNC;
}
+#ifdef CONFIG_NFS_V4_1
+ data->pdata.pnfs_error = -EAGAIN;
+#endif /* CONFIG_NFS_V4_1 */
nfs_restart_rpc(task, server->nfs_client);
return -EAGAIN;
}
@@ -1239,40 +1312,75 @@ static void nfs_commitdata_release(void
nfs_commit_free(wdata);
}
-/*
- * Set up the argument/result storage required for the RPC call.
- */
-static int nfs_commit_rpcsetup(struct list_head *head,
- struct nfs_write_data *data,
- int how)
+int nfs_initiate_commit(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how)
{
- struct nfs_page *first = nfs_list_entry(head->next);
- struct inode *inode = first->wb_context->path.dentry->d_inode;
+ struct inode *inode = data->inode;
int priority = flush_task_priority(how);
struct rpc_task *task;
struct rpc_message msg = {
.rpc_argp = &data->args,
.rpc_resp = &data->res,
- .rpc_cred = first->wb_context->cred,
+ .rpc_cred = data->cred,
};
struct rpc_task_setup task_setup_data = {
.task = &data->task,
- .rpc_client = NFS_CLIENT(inode),
+ .rpc_client = clnt,
.rpc_message = &msg,
- .callback_ops = &nfs_commit_ops,
+ .callback_ops = call_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
.flags = RPC_TASK_ASYNC,
.priority = priority,
};
+ /* Set up the initial task struct. */
+ NFS_PROTO(inode)->commit_setup(data, &msg);
+
+ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ if (how & FLUSH_SYNC)
+ rpc_wait_for_completion_task(task);
+ rpc_put_task(task);
+ return 0;
+}
+EXPORT_SYMBOL(nfs_initiate_commit);
+
+
+int pnfs_initiate_commit(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how, int pnfs)
+{
+ if (pnfs &&
+ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED))
+ return pnfs_get_write_status(data);
+
+ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how);
+}
+
+/*
+ * Set up the argument/result storage required for the RPC call.
+ */
+static int nfs_commit_rpcsetup(struct list_head *head,
+ struct nfs_write_data *data,
+ int how, int pnfs)
+{
+ struct nfs_page *first = nfs_list_entry(head->next);
+ struct inode *inode = first->wb_context->path.dentry->d_inode;
+
/* Set up the RPC argument and reply structs
* NB: take care not to mess about with data->commit et al. */
list_splice_init(head, &data->pages);
data->inode = inode;
- data->cred = msg.rpc_cred;
+ data->cred = first->wb_context->cred;
data->args.fh = NFS_FH(data->inode);
/* Note: we always request a commit of the entire inode */
@@ -1283,47 +1391,47 @@ static int nfs_commit_rpcsetup(struct li
data->res.fattr = &data->fattr;
data->res.verf = &data->verf;
nfs_fattr_init(&data->fattr);
+ kref_init(&data->refcount);
+ data->parent = NULL;
+ data->args.context = first->wb_context; /* used by commit done */
- /* Set up the initial task struct. */
- NFS_PROTO(inode)->commit_setup(data, &msg);
+ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops,
+ how, pnfs);
+}
- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+/* Handle memory error during commit */
+void nfs_mark_list_commit(struct list_head *head)
+{
+ struct nfs_page *req;
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task))
- return PTR_ERR(task);
- if (how & FLUSH_SYNC)
- rpc_wait_for_completion_task(task);
- rpc_put_task(task);
- return 0;
+ while (!list_empty(head)) {
+ req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_mark_request_commit(req);
+ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
+ nfs_clear_page_tag_locked(req);
+ }
}
+EXPORT_SYMBOL(nfs_mark_list_commit);
/*
* Commit dirty pages
*/
static int
-nfs_commit_list(struct inode *inode, struct list_head *head, int how)
+nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs)
{
struct nfs_write_data *data;
- struct nfs_page *req;
data = nfs_commitdata_alloc();
-
if (!data)
goto out_bad;
/* Set up the argument struct */
- return nfs_commit_rpcsetup(head, data, how);
+ return nfs_commit_rpcsetup(head, data, how, pnfs);
out_bad:
- while (!list_empty(head)) {
- req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_mark_request_commit(req);
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
- BDI_RECLAIMABLE);
- nfs_clear_page_tag_locked(req);
- }
+ nfs_mark_list_commit(head);
nfs_commit_clear_lock(NFS_I(inode));
return -ENOMEM;
}
@@ -1343,6 +1451,19 @@ static void nfs_commit_done(struct rpc_t
return;
}
+static inline void nfs_commit_cleanup(struct kref *kref)
+{
+ struct nfs_write_data *data;
+
+ data = container_of(kref, struct nfs_write_data, refcount);
+ /* Clear lock only when all cloned commits are finished */
+ if (data->parent)
+ kref_put(&data->parent->refcount, nfs_commit_cleanup);
+ else
+ nfs_commit_clear_lock(NFS_I(data->inode));
+ nfs_commitdata_release(data);
+}
+
static void nfs_commit_release(void *calldata)
{
struct nfs_write_data *data = calldata;
@@ -1360,6 +1481,11 @@ static void nfs_commit_release(void *cal
req->wb_bytes,
(long long)req_offset(req));
if (status < 0) {
+ if (req->wb_lseg) {
+ nfs_mark_request_nopnfs(req);
+ nfs_mark_request_dirty(req);
+ goto next;
+ }
nfs_context_set_write_error(req->wb_context, status);
nfs_inode_remove_request(req);
dprintk(", error = %d\n", status);
@@ -1376,12 +1502,12 @@ static void nfs_commit_release(void *cal
}
/* We have a mismatch. Write the page again */
dprintk(" mismatch\n");
+ nfs_mark_request_nopnfs(req);
nfs_mark_request_dirty(req);
next:
nfs_clear_page_tag_locked(req);
}
- nfs_commit_clear_lock(NFS_I(data->inode));
- nfs_commitdata_release(calldata);
+ kref_put(&data->refcount, nfs_commit_cleanup);
}
static const struct rpc_call_ops nfs_commit_ops = {
@@ -1397,21 +1523,22 @@ int nfs_commit_inode(struct inode *inode
LIST_HEAD(head);
int may_wait = how & FLUSH_SYNC;
int res = 0;
+ int use_pnfs = 0;
if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
goto out_mark_dirty;
spin_lock(&inode->i_lock);
- res = nfs_scan_commit(inode, &head, 0, 0);
+ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs);
spin_unlock(&inode->i_lock);
if (res) {
- int error = nfs_commit_list(inode, &head, how);
+ int error = nfs_commit_list(inode, &head, how, use_pnfs);
if (error < 0)
return error;
- if (may_wait)
+ if (may_wait) {
wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
nfs_wait_bit_killable,
TASK_KILLABLE);
- else
+ } else
goto out_mark_dirty;
} else
nfs_commit_clear_lock(NFS_I(inode));
@@ -1466,7 +1593,18 @@ static int nfs_commit_unstable_pages(str
int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- return nfs_commit_unstable_pages(inode, wbc);
+ int ret;
+ ret = nfs_commit_unstable_pages(inode, wbc);
+ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) {
+ int err, sync = wbc->sync_mode;
+
+ if (wbc->nonblocking || wbc->for_background)
+ sync = 0;
+ err = pnfs_layoutcommit_inode(inode, sync);
+ if (err < 0)
+ ret = err;
+ }
+ return ret;
}
/*
diff -up linux-2.6.38.noarch/include/linux/dcache.h.orig linux-2.6.38.noarch/include/linux/dcache.h
--- linux-2.6.38.noarch/include/linux/dcache.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/dcache.h 2011-03-26 07:57:44.307821030 -0400
@@ -169,6 +169,7 @@ struct dentry_operations {
char *(*d_dname)(struct dentry *, char *, int);
struct vfsmount *(*d_automount)(struct path *);
int (*d_manage)(struct dentry *, bool, bool);
+ void (*d_unlink)(struct dentry *, struct dentry *);
} ____cacheline_aligned;
/*
diff -up linux-2.6.38.noarch/include/linux/exportfs.h.orig linux-2.6.38.noarch/include/linux/exportfs.h
--- linux-2.6.38.noarch/include/linux/exportfs.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/exportfs.h 2011-03-26 07:57:44.309821012 -0400
@@ -2,6 +2,7 @@
#define LINUX_EXPORTFS_H 1
#include <linux/types.h>
+#include <linux/exp_xdr.h>
struct dentry;
struct inode;
@@ -188,4 +189,62 @@ extern struct dentry *generic_fh_to_pare
struct fid *fid, int fh_len, int fh_type,
struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
+#if defined(CONFIG_EXPORTFS_FILE_LAYOUT)
+struct pnfs_filelayout_device;
+struct pnfs_filelayout_layout;
+
+extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr,
+ const struct pnfs_filelayout_device *fdev);
+extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr,
+ const struct pnfs_filelayout_layout *flp);
+#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */
+
+#if defined(CONFIG_EXPORTFS_FILE_LAYOUT)
+struct list_head;
+
+extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr,
+ const struct list_head *volumes);
+
+extern enum nfsstat4 blocklayout_encode_layout(struct exp_xdr_stream *xdr,
+ const struct list_head *layouts);
+#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */
+
+#if defined(CONFIG_PNFSD)
+#include <linux/module.h>
+
+struct pnfsd_cb_operations;
+
+struct pnfsd_cb_ctl {
+ spinlock_t lock;
+ struct module *module;
+ const struct pnfsd_cb_operations *cb_op;
+};
+
+/* in expfs.c so that file systems can depend on it */
+extern struct pnfsd_cb_ctl pnfsd_cb_ctl;
+
+static inline int
+pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl)
+{
+ int ret = -ENOENT;
+
+ spin_lock(&pnfsd_cb_ctl.lock);
+ if (!pnfsd_cb_ctl.cb_op)
+ goto out;
+ if (!try_module_get(pnfsd_cb_ctl.module))
+ goto out;
+ ctl->cb_op = pnfsd_cb_ctl.cb_op;
+ ctl->module = pnfsd_cb_ctl.module;
+ ret = 0;
+out:
+ spin_unlock(&pnfsd_cb_ctl.lock);
+ return ret;
+}
+
+static inline void
+pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl)
+{
+ module_put(ctl->module);
+}
+#endif /* CONFIG_PNFSD */
#endif /* LINUX_EXPORTFS_H */
diff -up linux-2.6.38.noarch/include/linux/exp_xdr.h.orig linux-2.6.38.noarch/include/linux/exp_xdr.h
--- linux-2.6.38.noarch/include/linux/exp_xdr.h.orig 2011-03-26 07:57:44.308821021 -0400
+++ linux-2.6.38.noarch/include/linux/exp_xdr.h 2011-03-26 07:57:44.308821021 -0400
@@ -0,0 +1,141 @@
+#ifndef _LINUX_EXP_XDR_H
+#define _LINUX_EXP_XDR_H
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/string.h>
+
+struct exp_xdr_stream {
+ __be32 *p;
+ __be32 *end;
+};
+
+/**
+ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes
+ * @nbytes: number of bytes to encode
+ */
+static inline size_t
+exp_xdr_qwords(__u32 nbytes)
+{
+ return DIV_ROUND_UP(nbytes, 4);
+}
+
+/**
+ * exp_xdr_qbytes - Calculate the number of bytes holding qwords
+ * @qwords: number of quad-words to encode
+ */
+static inline size_t
+exp_xdr_qbytes(size_t qwords)
+{
+ return qwords << 2;
+}
+
+/**
+ * exp_xdr_reserve_space - Reserve buffer space for sending
+ * @xdr: pointer to exp_xdr_stream
+ * @nbytes: number of bytes to reserve
+ *
+ * Checks that we have enough buffer space to encode 'nbytes' more
+ * bytes of data. If so, update the xdr stream.
+ */
+static inline __be32 *
+exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes)
+{
+ __be32 *p = xdr->p;
+ __be32 *q;
+
+ /* align nbytes on the next 32-bit boundary */
+ q = p + exp_xdr_qwords(nbytes);
+ if (unlikely(q > xdr->end || q < p))
+ return NULL;
+ xdr->p = q;
+ return p;
+}
+
+/**
+ * exp_xdr_reserve_qwords - Reserve buffer space for sending
+ * @xdr: pointer to exp_xdr_stream
+ * @nwords: number of quad words (u32's) to reserve
+ */
+static inline __be32 *
+exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords)
+{
+ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords));
+}
+
+/**
+ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream
+ * @p: pointer to encoding destination
+ * @val: value to encode
+ */
+static inline __be32 *
+exp_xdr_encode_u32(__be32 *p, __u32 val)
+{
+ *p = cpu_to_be32(val);
+ return p + 1;
+}
+
+/**
+ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream
+ * @p: pointer to encoding destination
+ * @val: value to encode
+ */
+static inline __be32 *
+exp_xdr_encode_u64(__be32 *p, __u64 val)
+{
+ put_unaligned_be64(val, p);
+ return p + 2;
+}
+
+/**
+ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream
+ * @p: pointer to encoding destination
+ * @ptr: pointer to the array of bytes
+ * @nbytes: number of bytes to encode
+ */
+static inline __be32 *
+exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes)
+{
+ if (likely(nbytes != 0)) {
+ unsigned int qwords = exp_xdr_qwords(nbytes);
+ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes;
+
+ memcpy(p, ptr, nbytes);
+ if (padding != 0)
+ memset((char *)p + nbytes, 0, padding);
+ p += qwords;
+ }
+ return p;
+}
+
+/**
+ * exp_xdr_encode_opaque - Encode an opaque type onto a xdr stream
+ * @p: pointer to encoding destination
+ * @ptr: pointer to the opaque array
+ * @nbytes: number of bytes to encode
+ *
+ * Encodes the 32-bit opaque size in bytes followed by the opaque value.
+ */
+static inline __be32 *
+exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes)
+{
+ p = exp_xdr_encode_u32(p, nbytes);
+ return exp_xdr_encode_bytes(p, ptr, nbytes);
+}
+
+/**
+ * exp_xdr_encode_opaque_qlen - Encode the opaque length onto a xdr stream
+ * @lenp: pointer to the opaque length destination
+ * @endp: pointer to the end of the opaque array
+ *
+ * Encodes the 32-bit opaque size in bytes given the start and end pointers
+ */
+static inline __be32 *
+exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp)
+{
+ size_t nbytes = (char *)endp - (char *)(lenp + 1);
+
+ exp_xdr_encode_u32(lenp, nbytes);
+ return lenp + 1 + exp_xdr_qwords(nbytes);
+}
+#endif /* _LINUX_EXP_XDR_H */
diff -up linux-2.6.38.noarch/include/linux/fs.h.orig linux-2.6.38.noarch/include/linux/fs.h
--- linux-2.6.38.noarch/include/linux/fs.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/fs.h 2011-03-26 07:57:44.311820996 -0400
@@ -399,6 +399,7 @@ struct inodes_stat_t {
#include <asm/byteorder.h>
struct export_operations;
+struct pnfs_export_operations;
struct hd_geometry;
struct iovec;
struct nameidata;
@@ -1368,6 +1369,7 @@ struct super_block {
const struct dquot_operations *dq_op;
const struct quotactl_ops *s_qcop;
const struct export_operations *s_export_op;
+ const struct pnfs_export_operations *s_pnfs_op;
unsigned long s_flags;
unsigned long s_magic;
struct dentry *s_root;
diff -up linux-2.6.38.noarch/include/linux/nfs4.h.orig linux-2.6.38.noarch/include/linux/nfs4.h
--- linux-2.6.38.noarch/include/linux/nfs4.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/nfs4.h 2011-03-26 07:57:44.313820980 -0400
@@ -17,6 +17,7 @@
#define NFS4_BITMAP_SIZE 2
#define NFS4_VERIFIER_SIZE 8
+#define NFS4_CLIENTID_SIZE 8
#define NFS4_STATEID_SEQID_SIZE 4
#define NFS4_STATEID_OTHER_SIZE 12
#define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE)
@@ -131,6 +132,13 @@
#define EXCHGID4_FLAG_MASK_A 0x40070103
#define EXCHGID4_FLAG_MASK_R 0x80070103
+static inline bool
+is_ds_only_session(u32 exchange_flags)
+{
+ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS;
+ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS;
+}
+
#define SEQ4_STATUS_CB_PATH_DOWN 0x00000001
#define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002
#define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004
@@ -181,7 +189,13 @@ struct nfs4_acl {
struct nfs4_ace aces[0];
};
+struct nfs4_fsid {
+ u64 major;
+ u64 minor;
+};
+
typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
+typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid;
struct nfs41_stateid {
__be32 seqid;
@@ -559,7 +573,12 @@ enum {
NFSPROC4_CLNT_GET_LEASE_TIME,
NFSPROC4_CLNT_RECLAIM_COMPLETE,
NFSPROC4_CLNT_LAYOUTGET,
+ NFSPROC4_CLNT_LAYOUTCOMMIT,
+ NFSPROC4_CLNT_LAYOUTRETURN,
+ NFSPROC4_CLNT_GETDEVICELIST,
NFSPROC4_CLNT_GETDEVICEINFO,
+ NFSPROC4_CLNT_PNFS_WRITE,
+ NFSPROC4_CLNT_PNFS_COMMIT,
};
/* nfs41 types */
@@ -582,6 +601,8 @@ enum pnfs_layouttype {
LAYOUT_NFSV4_1_FILES = 1,
LAYOUT_OSD2_OBJECTS = 2,
LAYOUT_BLOCK_VOLUME = 3,
+
+ NFS4_PNFS_PRIVATE_LAYOUT = 0x80000000
};
/* used for both layout return and recall */
diff -up linux-2.6.38.noarch/include/linux/nfsd4_block.h.orig linux-2.6.38.noarch/include/linux/nfsd4_block.h
--- linux-2.6.38.noarch/include/linux/nfsd4_block.h.orig 2011-03-26 07:57:44.326820870 -0400
+++ linux-2.6.38.noarch/include/linux/nfsd4_block.h 2011-03-26 07:57:44.326820870 -0400
@@ -0,0 +1,101 @@
+#ifndef NFSD4_BLOCK
+#define NFSD4_BLOCK
+
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svcauth.h>
+#include <linux/nfsd/nfsfh.h>
+#include <linux/nfsd/nfsd4_pnfs.h>
+
+#define PNFS_BLOCK_SUCCESS 1
+#define PNFS_BLOCK_FAILURE 0
+
+#define PNFS_BLOCK_CTL_START 1
+#define PNFS_BLOCK_CTL_STOP 2
+#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current
+ * version from kernel via an upcall.
+ */
+
+#define PNFS_UPCALL_MSG_STOP 0
+#define PNFS_UPCALL_MSG_GETSIG 1
+#define PNFS_UPCALL_MSG_GETSLICE 2
+#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume
+#define PNFS_UPCALL_MSG_DMGET 4
+#define PNFS_UPCALL_MSG_VERS 5
+
+#define PNFS_UPCALL_VERS 8
+
+typedef struct stripe_dev {
+ int major,
+ minor,
+ offset;
+} stripe_dev_t;
+
+typedef struct bl_comm_res {
+ int res_status;
+ union {
+ struct {
+ long long start,
+ length;
+ } slice;
+ struct {
+ int num_stripes,
+ stripe_size;
+ stripe_dev_t devs[];
+ } stripe;
+ struct {
+ long long sector;
+ int offset,
+ len;
+ char sig[];
+ } sig;
+ int vers,
+ dm_vol;
+ } u;
+} bl_comm_res_t;
+
+typedef struct bl_comm_msg {
+ int msg_type,
+ msg_status;
+ union {
+ dev_t msg_dev;
+ int msg_vers;
+ } u;
+ bl_comm_res_t *msg_res;
+} bl_comm_msg_t;
+
+#ifdef __KERNEL__
+
+typedef struct bl_comm {
+ /* ---- protects access to this structure ---- */
+ struct mutex lock;
+ /* ---- protects access to rpc pipe ---- */
+ struct mutex pipe_lock;
+ struct dentry *pipe_dentry;
+ wait_queue_head_t pipe_wq;
+ bl_comm_msg_t msg;
+} bl_comm_t;
+
+int pnfs_block_enabled(struct inode *, int);
+int bl_layout_type(struct super_block *sb);
+int bl_getdeviceiter(struct super_block *, u32 layout_type,
+ struct nfsd4_pnfs_dev_iter_res *);
+int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *,
+ u32 layout_type,
+ const struct nfsd4_pnfs_deviceid *);
+enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *,
+ const struct nfsd4_pnfs_layoutget_arg *,
+ struct nfsd4_pnfs_layoutget_res *);
+int bl_layoutcommit(struct inode *,
+ const struct nfsd4_pnfs_layoutcommit_arg *,
+ struct nfsd4_pnfs_layoutcommit_res *);
+int bl_layoutreturn(struct inode *,
+ const struct nfsd4_pnfs_layoutreturn_arg *);
+int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len);
+int bl_init_proc(void);
+int bl_upcall(bl_comm_t *, bl_comm_msg_t *, bl_comm_res_t **);
+
+extern bl_comm_t *bl_comm_global; // Ugly...
+#endif /* __KERNEL__ */
+
+#endif /* NFSD4_BLOCK */
+
diff -up linux-2.6.38.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.38.noarch/include/linux/nfsd4_spnfs.h
--- linux-2.6.38.noarch/include/linux/nfsd4_spnfs.h.orig 2011-03-26 07:57:44.328820852 -0400
+++ linux-2.6.38.noarch/include/linux/nfsd4_spnfs.h 2011-03-26 07:57:44.328820852 -0400
@@ -0,0 +1,345 @@
+/*
+ * include/linux/nfsd4_spnfs.h
+ *
+ * spNFS - simple pNFS implementation with userspace daemon
+ *
+ */
+
+/******************************************************************************
+
+(c) 2007 Network Appliance, Inc. All Rights Reserved.
+
+Network Appliance provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+#ifndef NFS_SPNFS_H
+#define NFS_SPNFS_H
+
+
+#ifdef __KERNEL__
+#include "exportfs.h"
+#include "sunrpc/svc.h"
+#include "nfsd/nfsfh.h"
+#else
+#include <sys/types.h>
+#endif /* __KERNEL__ */
+
+#define SPNFS_STATUS_INVALIDMSG 0x01
+#define SPNFS_STATUS_AGAIN 0x02
+#define SPNFS_STATUS_FAIL 0x04
+#define SPNFS_STATUS_SUCCESS 0x08
+
+#define SPNFS_TYPE_LAYOUTGET 0x01
+#define SPNFS_TYPE_LAYOUTCOMMIT 0x02
+#define SPNFS_TYPE_LAYOUTRETURN 0x03
+#define SPNFS_TYPE_GETDEVICEITER 0x04
+#define SPNFS_TYPE_GETDEVICEINFO 0x05
+#define SPNFS_TYPE_SETATTR 0x06
+#define SPNFS_TYPE_OPEN 0x07
+#define SPNFS_TYPE_CLOSE 0x08
+#define SPNFS_TYPE_CREATE 0x09
+#define SPNFS_TYPE_REMOVE 0x0a
+#define SPNFS_TYPE_COMMIT 0x0b
+#define SPNFS_TYPE_READ 0x0c
+#define SPNFS_TYPE_WRITE 0x0d
+
+#define SPNFS_MAX_DEVICES 1
+#define SPNFS_MAX_DATA_SERVERS 16
+#define SPNFS_MAX_IO 512
+
+/* layout */
+struct spnfs_msg_layoutget_args {
+ unsigned long inode;
+ unsigned long generation;
+};
+
+struct spnfs_filelayout_list {
+ u_int32_t fh_len;
+ unsigned char fh_val[128]; /* DMXXX fix this const */
+};
+
+struct spnfs_msg_layoutget_res {
+ int status;
+ u_int64_t devid;
+ u_int64_t stripe_size;
+ u_int32_t stripe_type;
+ u_int32_t stripe_count;
+ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS];
+};
+
+/* layoutcommit */
+struct spnfs_msg_layoutcommit_args {
+ unsigned long inode;
+ unsigned long generation;
+ u_int64_t file_size;
+};
+
+struct spnfs_msg_layoutcommit_res {
+ int status;
+};
+
+/* layoutreturn */
+/* No op for the daemon */
+/*
+struct spnfs_msg_layoutreturn_args {
+};
+
+struct spnfs_msg_layoutreturn_res {
+};
+*/
+
+/* getdeviceiter */
+struct spnfs_msg_getdeviceiter_args {
+ unsigned long inode;
+ u_int64_t cookie;
+ u_int64_t verf;
+};
+
+struct spnfs_msg_getdeviceiter_res {
+ int status;
+ u_int64_t devid;
+ u_int64_t cookie;
+ u_int64_t verf;
+ u_int32_t eof;
+};
+
+/* getdeviceinfo */
+struct spnfs_data_server {
+ u_int32_t dsid;
+ char netid[5];
+ char addr[29];
+};
+
+struct spnfs_device {
+ u_int64_t devid;
+ int dscount;
+ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS];
+};
+
+struct spnfs_msg_getdeviceinfo_args {
+ u_int64_t devid;
+};
+
+struct spnfs_msg_getdeviceinfo_res {
+ int status;
+ struct spnfs_device devinfo;
+};
+
+/* setattr */
+struct spnfs_msg_setattr_args {
+ unsigned long inode;
+ unsigned long generation;
+ int file_size;
+};
+
+struct spnfs_msg_setattr_res {
+ int status;
+};
+
+/* open */
+struct spnfs_msg_open_args {
+ unsigned long inode;
+ unsigned long generation;
+ int create;
+ int createmode;
+ int truncate;
+};
+
+struct spnfs_msg_open_res {
+ int status;
+};
+
+/* close */
+/* No op for daemon */
+struct spnfs_msg_close_args {
+ int x;
+};
+
+struct spnfs_msg_close_res {
+ int y;
+};
+
+/* create */
+/*
+struct spnfs_msg_create_args {
+ int x;
+};
+
+struct spnfs_msg_create_res {
+ int y;
+};
+*/
+
+/* remove */
+struct spnfs_msg_remove_args {
+ unsigned long inode;
+ unsigned long generation;
+};
+
+struct spnfs_msg_remove_res {
+ int status;
+};
+
+/* commit */
+/*
+struct spnfs_msg_commit_args {
+ int x;
+};
+
+struct spnfs_msg_commit_res {
+ int y;
+};
+*/
+
+/* read */
+struct spnfs_msg_read_args {
+ unsigned long inode;
+ unsigned long generation;
+ loff_t offset;
+ unsigned long len;
+};
+
+struct spnfs_msg_read_res {
+ int status;
+ char data[SPNFS_MAX_IO];
+};
+
+/* write */
+struct spnfs_msg_write_args {
+ unsigned long inode;
+ unsigned long generation;
+ loff_t offset;
+ unsigned long len;
+ char data[SPNFS_MAX_IO];
+};
+
+struct spnfs_msg_write_res {
+ int status;
+};
+
+/* bundle args and responses */
+union spnfs_msg_args {
+ struct spnfs_msg_layoutget_args layoutget_args;
+ struct spnfs_msg_layoutcommit_args layoutcommit_args;
+/*
+ struct spnfs_msg_layoutreturn_args layoutreturn_args;
+*/
+ struct spnfs_msg_getdeviceiter_args getdeviceiter_args;
+ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args;
+ struct spnfs_msg_setattr_args setattr_args;
+ struct spnfs_msg_open_args open_args;
+ struct spnfs_msg_close_args close_args;
+/*
+ struct spnfs_msg_create_args create_args;
+*/
+ struct spnfs_msg_remove_args remove_args;
+/*
+ struct spnfs_msg_commit_args commit_args;
+*/
+ struct spnfs_msg_read_args read_args;
+ struct spnfs_msg_write_args write_args;
+};
+
+union spnfs_msg_res {
+ struct spnfs_msg_layoutget_res layoutget_res;
+ struct spnfs_msg_layoutcommit_res layoutcommit_res;
+/*
+ struct spnfs_msg_layoutreturn_res layoutreturn_res;
+*/
+ struct spnfs_msg_getdeviceiter_res getdeviceiter_res;
+ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res;
+ struct spnfs_msg_setattr_res setattr_res;
+ struct spnfs_msg_open_res open_res;
+ struct spnfs_msg_close_res close_res;
+/*
+ struct spnfs_msg_create_res create_res;
+*/
+ struct spnfs_msg_remove_res remove_res;
+/*
+ struct spnfs_msg_commit_res commit_res;
+*/
+ struct spnfs_msg_read_res read_res;
+ struct spnfs_msg_write_res write_res;
+};
+
+/* a spnfs message, args and response */
+struct spnfs_msg {
+ unsigned char im_type;
+ unsigned char im_status;
+ union spnfs_msg_args im_args;
+ union spnfs_msg_res im_res;
+};
+
+/* spnfs configuration info */
+struct spnfs_config {
+ unsigned char dense_striping;
+ int stripe_size;
+ int num_ds;
+ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */
+};
+
+#if defined(__KERNEL__) && defined(CONFIG_SPNFS)
+
+#include <linux/nfsd/nfsd4_pnfs.h>
+
+/* pipe mgmt structure. messages flow through here */
+struct spnfs {
+ struct dentry *spnfs_dentry; /* dentry for pipe */
+ wait_queue_head_t spnfs_wq;
+ struct spnfs_msg spnfs_im; /* spnfs message */
+ struct mutex spnfs_lock; /* Serializes upcalls */
+ struct mutex spnfs_plock;
+};
+
+struct nfsd4_open;
+
+int spnfs_layout_type(struct super_block *);
+enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr,
+ const struct nfsd4_pnfs_layoutget_arg *,
+ struct nfsd4_pnfs_layoutget_res *);
+int spnfs_layoutcommit(void);
+int spnfs_layoutreturn(struct inode *,
+ const struct nfsd4_pnfs_layoutreturn_arg *);
+int spnfs_getdeviceiter(struct super_block *,
+ u32 layout_type,
+ struct nfsd4_pnfs_dev_iter_res *);
+int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *,
+ u32 layout_type,
+ const struct nfsd4_pnfs_deviceid *);
+int spnfs_setattr(void);
+int spnfs_open(struct inode *, struct nfsd4_open *);
+int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *);
+int spnfs_remove(unsigned long, unsigned long);
+__be32 spnfs_read(struct inode *, loff_t, unsigned long *,
+ int, struct svc_rqst *);
+__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *);
+int spnfs_getfh(int, struct nfs_fh *);
+int spnfs_test_layoutrecall(char *, u64, u64);
+int spnfs_layoutrecall(struct inode *, int, u64, u64);
+
+int nfsd_spnfs_new(void);
+void nfsd_spnfs_delete(void);
+int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *);
+int spnfs_enabled(void);
+int spnfs_init_proc(void);
+
+extern struct spnfs_config *spnfs_config;
+
+#endif /* __KERNEL__ && CONFIG_SPNFS */
+
+#endif /* NFS_SPNFS_H */
diff -up linux-2.6.38.noarch/include/linux/nfsd/const.h.orig linux-2.6.38.noarch/include/linux/nfsd/const.h
--- linux-2.6.38.noarch/include/linux/nfsd/const.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/nfsd/const.h 2011-03-26 07:57:44.321820912 -0400
@@ -29,6 +29,7 @@
#ifdef __KERNEL__
#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/svc.h>
/*
* Largest number of bytes we need to allocate for an NFS
diff -up linux-2.6.38.noarch/include/linux/nfsd/debug.h.orig linux-2.6.38.noarch/include/linux/nfsd/debug.h
--- linux-2.6.38.noarch/include/linux/nfsd/debug.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/nfsd/debug.h 2011-03-26 07:57:44.322820904 -0400
@@ -32,6 +32,8 @@
#define NFSDDBG_REPCACHE 0x0080
#define NFSDDBG_XDR 0x0100
#define NFSDDBG_LOCKD 0x0200
+#define NFSDDBG_PNFS 0x0400
+#define NFSDDBG_FILELAYOUT 0x0800
#define NFSDDBG_ALL 0x7FFF
#define NFSDDBG_NOCHANGE 0xFFFF
diff -up linux-2.6.38.noarch/include/linux/nfsd/export.h.orig linux-2.6.38.noarch/include/linux/nfsd/export.h
--- linux-2.6.38.noarch/include/linux/nfsd/export.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/nfsd/export.h 2011-03-26 07:57:44.322820904 -0400
@@ -79,6 +79,20 @@ struct nfsd4_fs_locations {
};
/*
+ * Callbacks
+ */
+struct nfsd4_callback {
+ void *cb_op;
+ struct nfs4_client *cb_clp;
+ struct list_head cb_per_client;
+ u32 cb_minorversion;
+ struct rpc_message cb_msg;
+ const struct rpc_call_ops *cb_ops;
+ struct work_struct cb_work;
+ bool cb_done;
+};
+
+/*
* We keep an array of pseudoflavors with the export, in order from most
* to least preferred. For the forseeable future, we don't expect more
* than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3,
@@ -100,6 +114,7 @@ struct svc_export {
uid_t ex_anon_uid;
gid_t ex_anon_gid;
int ex_fsid;
+ int ex_pnfs;
unsigned char * ex_uuid; /* 16 byte fsid */
struct nfsd4_fs_locations ex_fslocs;
int ex_nflavors;
diff -up linux-2.6.38.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.38.noarch/include/linux/nfsd/nfs4layoutxdr.h
--- linux-2.6.38.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2011-03-26 07:57:44.323820896 -0400
+++ linux-2.6.38.noarch/include/linux/nfsd/nfs4layoutxdr.h 2011-03-26 07:57:44.323820896 -0400
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef NFSD_NFS4LAYOUTXDR_H
+#define NFSD_NFS4LAYOUTXDR_H
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/nfsd/nfsd4_pnfs.h>
+
+/* the nfsd4_pnfs_devlist dev_addr for the file layout type */
+struct pnfs_filelayout_devaddr {
+ struct xdr_netobj r_netid;
+ struct xdr_netobj r_addr;
+};
+
+/* list of multipath servers */
+struct pnfs_filelayout_multipath {
+ u32 fl_multipath_length;
+ struct pnfs_filelayout_devaddr *fl_multipath_list;
+};
+
+struct pnfs_filelayout_device {
+ u32 fl_stripeindices_length;
+ u32 *fl_stripeindices_list;
+ u32 fl_device_length;
+ struct pnfs_filelayout_multipath *fl_device_list;
+};
+
+struct pnfs_filelayout_layout {
+ u32 lg_layout_type; /* response */
+ u32 lg_stripe_type; /* response */
+ u32 lg_commit_through_mds; /* response */
+ u64 lg_stripe_unit; /* response */
+ u64 lg_pattern_offset; /* response */
+ u32 lg_first_stripe_index; /* response */
+ struct nfsd4_pnfs_deviceid device_id; /* response */
+ u32 lg_fh_length; /* response */
+ struct knfsd_fh *lg_fh_list; /* response */
+};
+
+enum stripetype4 {
+ STRIPE_SPARSE = 1,
+ STRIPE_DENSE = 2
+};
+
+enum pnfs_block_extent_state4 {
+ PNFS_BLOCK_READWRITE_DATA = 0,
+ PNFS_BLOCK_READ_DATA = 1,
+ PNFS_BLOCK_INVALID_DATA = 2,
+ PNFS_BLOCK_NONE_DATA = 3
+};
+
+enum pnfs_block_volume_type4 {
+ PNFS_BLOCK_VOLUME_SIMPLE = 0,
+ PNFS_BLOCK_VOLUME_SLICE = 1,
+ PNFS_BLOCK_VOLUME_CONCAT = 2,
+ PNFS_BLOCK_VOLUME_STRIPE = 3,
+};
+typedef enum pnfs_block_volume_type4 pnfs_block_volume_type4;
+
+enum bl_cache_state {
+ BLOCK_LAYOUT_NEW = 0,
+ BLOCK_LAYOUT_CACHE = 1,
+ BLOCK_LAYOUT_UPDATE = 2,
+};
+
+typedef struct pnfs_blocklayout_layout {
+ struct list_head bll_list;
+ struct nfsd4_pnfs_deviceid bll_vol_id;
+ u64 bll_foff; // file offset
+ u64 bll_len;
+ u64 bll_soff; // storage offset
+ int bll_recalled;
+ enum pnfs_block_extent_state4 bll_es;
+ enum bl_cache_state bll_cache_state;
+} pnfs_blocklayout_layout_t;
+
+typedef struct pnfs_blocklayout_devinfo {
+ struct list_head bld_list;
+ pnfs_block_volume_type4 bld_type;
+ struct nfsd4_pnfs_deviceid bld_devid;
+ int bld_index_loc;
+ union {
+ struct {
+ u64 bld_offset;
+ u32 bld_sig_len,
+ *bld_sig;
+ } simple;
+ struct {
+ u64 bld_start,
+ bld_len;
+ u32 bld_index; /* Index of Simple Volume */
+ } slice;
+ struct {
+ u32 bld_stripes;
+ u64 bld_chunk_size;
+ u32 *bld_stripe_indexs;
+ } stripe;
+ } u;
+} pnfs_blocklayout_devinfo_t;
+
+#endif /* NFSD_NFS4LAYOUTXDR_H */
diff -up linux-2.6.38.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.38.noarch/include/linux/nfsd/nfs4pnfsdlm.h
--- linux-2.6.38.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2011-03-26 07:57:44.323820896 -0400
+++ linux-2.6.38.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2011-03-26 07:57:44.323820896 -0400
@@ -0,0 +1,54 @@
+/******************************************************************************
+ *
+ * (c) 2007 Network Appliance, Inc. All Rights Reserved.
+ * (c) 2009 NetApp. All Rights Reserved.
+ *
+ * NetApp provides this source code under the GPL v2 License.
+ * The GPL v2 license is available at
+ * http://opensource.org/licenses/gpl-license.php.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#include <linux/genhd.h>
+
+/*
+ * Length of comma separated pnfs data server IPv4 addresses. Enough room for
+ * 32 addresses.
+ */
+#define NFSD_DLM_DS_LIST_MAX 512
+/*
+ * Length of colon separated pnfs dlm device of the form
+ * disk_name:comma separated data server IPv4 address
+ */
+#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1)
+
+#ifdef CONFIG_PNFSD
+
+/* For use by DLM cluster file systems exported by pNFSD */
+extern const struct pnfs_export_operations pnfs_dlm_export_ops;
+
+int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len);
+
+void nfsd4_pnfs_dlm_shutdown(void);
+
+ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen);
+
+#else /* CONFIG_PNFSD */
+
+static inline void nfsd4_pnfs_dlm_shutdown(void)
+{
+ return;
+}
+
+#endif /* CONFIG_PNFSD */
diff -up linux-2.6.38.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.38.noarch/include/linux/nfsd/nfsd4_pnfs.h
--- linux-2.6.38.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2011-03-26 07:57:44.324820888 -0400
+++ linux-2.6.38.noarch/include/linux/nfsd/nfsd4_pnfs.h 2011-03-26 07:57:44.324820888 -0400
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _LINUX_NFSD_NFSD4_PNFS_H
+#define _LINUX_NFSD_NFSD4_PNFS_H
+
+#include <linux/exportfs.h>
+#include <linux/exp_xdr.h>
+#include <linux/nfs_xdr.h>
+#include <linux/nfsd/export.h>
+
+struct nfsd4_pnfs_deviceid {
+ u64 sbid; /* per-superblock unique ID */
+ u64 devid; /* filesystem-wide unique device ID */
+};
+
+struct nfsd4_pnfs_dev_iter_res {
+	u64			gd_cookie;	/* request/response */
+	u64			gd_verf;	/* request/response */
+ u64 gd_devid; /* response */
+ u32 gd_eof; /* response */
+};
+
+/* Arguments for set_device_notify */
+struct pnfs_devnotify_arg {
+ struct nfsd4_pnfs_deviceid dn_devid; /* request */
+ u32 dn_layout_type; /* request */
+ u32 dn_notify_types; /* request/response */
+};
+
+struct nfsd4_layout_seg {
+ u64 clientid;
+ u32 layout_type;
+ u32 iomode;
+ u64 offset;
+ u64 length;
+};
+
+/* Used by layout_get to encode layout (loc_body var in spec)
+ * Args:
+ * minlength - min number of accessible bytes given by layout
+ * fsid - Major part of struct pnfs_deviceid. File system uses this
+ * to build the deviceid returned in the layout.
+ * fh - fs can modify the file handle for use on data servers
+ * seg - layout info requested and layout info returned
+ * xdr - xdr info
+ * return_on_close - true if layout to be returned on file close
+ */
+
+struct nfsd4_pnfs_layoutget_arg {
+ u64 lg_minlength;
+ u64 lg_sbid;
+ const struct knfsd_fh *lg_fh;
+};
+
+struct nfsd4_pnfs_layoutget_res {
+	struct nfsd4_layout_seg	lg_seg;	/* request/response */
+ u32 lg_return_on_close;
+};
+
+struct nfsd4_pnfs_layoutcommit_arg {
+ struct nfsd4_layout_seg lc_seg; /* request */
+ u32 lc_reclaim; /* request */
+ u32 lc_newoffset; /* request */
+ u64 lc_last_wr; /* request */
+ struct nfstime4 lc_mtime; /* request */
+ u32 lc_up_len; /* layout length */
+ void *lc_up_layout; /* decoded by callback */
+};
+
+struct nfsd4_pnfs_layoutcommit_res {
+ u32 lc_size_chg; /* boolean for response */
+ u64 lc_newsize; /* response */
+};
+
+#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */
+
+struct nfsd4_pnfs_layoutreturn_arg {
+ u32 lr_return_type; /* request */
+ struct nfsd4_layout_seg lr_seg; /* request */
+ u32 lr_reclaim; /* request */
+ u32 lrf_body_len; /* request */
+ void *lrf_body; /* request */
+ void *lr_cookie; /* fs private */
+};
+
+/* pNFS Metadata to Data server state communication */
+struct pnfs_get_state {
+ u32 dsid; /* request */
+ u64 ino; /* request */
+ nfs4_stateid stid; /* request;response */
+ nfs4_clientid clid; /* response */
+ u32 access; /* response */
+ u32 stid_gen; /* response */
+ u32 verifier[2]; /* response */
+};
+
+/*
+ * pNFS export operations vector.
+ *
+ * The filesystem must implement the following methods:
+ * layout_type
+ * get_device_info
+ * layout_get
+ *
+ * All other methods are optional and can be set to NULL if not implemented.
+ */
+struct pnfs_export_operations {
+ /* Returns the supported pnfs_layouttype4. */
+ int (*layout_type) (struct super_block *);
+
+ /* Encode device info onto the xdr stream. */
+ int (*get_device_info) (struct super_block *,
+ struct exp_xdr_stream *,
+ u32 layout_type,
+ const struct nfsd4_pnfs_deviceid *);
+
+ /* Retrieve all available devices via an iterator.
+ * arg->cookie == 0 indicates the beginning of the list,
+ * otherwise arg->verf is used to verify that the list hasn't changed
+ * while retrieved.
+ *
+ * On output, the filesystem sets the devid based on the current cookie
+ * and sets res->cookie and res->verf corresponding to the next entry.
+ * When the last entry in the list is retrieved, res->eof is set to 1.
+ */
+ int (*get_device_iter) (struct super_block *,
+ u32 layout_type,
+ struct nfsd4_pnfs_dev_iter_res *);
+
+ int (*set_device_notify) (struct super_block *,
+ struct pnfs_devnotify_arg *);
+
+ /* Retrieve and encode a layout for inode onto the xdr stream.
+ * arg->minlength is the minimum number of accessible bytes required
+ * by the client.
+ * The maximum number of bytes to encode the layout is given by
+ * the xdr stream end pointer.
+ * arg->fsid contains the major part of struct pnfs_deviceid.
+ * The file system uses this to build the deviceid returned
+ * in the layout.
+ * res->seg - layout segment requested and layout info returned.
+	 * res->fh - the file handle may be modified for use on data servers
+ * res->return_on_close - true if layout to be returned on file close
+ *
+ * return one of the following nfs errors:
+ * NFS_OK Success
+ * NFS4ERR_ACCESS Permission error
+ * NFS4ERR_BADIOMODE Server does not support requested iomode
+ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules
+ * NFS4ERR_INVAL Parameter other than layout is invalid
+ * NFS4ERR_IO I/O error
+ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later
+ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file
+ * NFS4ERR_LOCKED Lock conflict
+	 *	NFS4ERR_NOSPC		Out-of-space error occurred
+	 *	NFS4ERR_RECALLCONFLICT	Layout currently unavailable due to
+	 *				a conflicting CB_LAYOUTRECALL
+	 *	NFS4ERR_SERVERFAULT	Server went berserk
+ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout
+ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file)
+ */
+ enum nfsstat4 (*layout_get) (struct inode *,
+ struct exp_xdr_stream *xdr,
+ const struct nfsd4_pnfs_layoutget_arg *,
+ struct nfsd4_pnfs_layoutget_res *);
+
+ /* Commit changes to layout */
+ int (*layout_commit) (struct inode *,
+ const struct nfsd4_pnfs_layoutcommit_arg *,
+ struct nfsd4_pnfs_layoutcommit_res *);
+
+ /* Returns the layout */
+ int (*layout_return) (struct inode *,
+ const struct nfsd4_pnfs_layoutreturn_arg *);
+
+ /* Can layout segments be merged for this layout type? */
+ int (*can_merge_layouts) (u32 layout_type);
+
+ /* pNFS Files layout specific operations */
+
+ /* Get the write verifier for DS (called on MDS only) */
+ void (*get_verifier) (struct super_block *, u32 *p);
+ /* Call fs on DS only */
+ int (*get_state) (struct inode *, struct knfsd_fh *,
+ struct pnfs_get_state *);
+};
+
+struct nfsd4_pnfs_cb_layout {
+ u32 cbl_recall_type; /* request */
+ struct nfsd4_layout_seg cbl_seg; /* request */
+ u32 cbl_layoutchanged; /* request */
+ nfs4_stateid cbl_sid; /* request */
+ struct nfs4_fsid cbl_fsid;
+ void *cbl_cookie; /* fs private */
+};
+
+/* layoutrecall request (from exported filesystem) */
+struct nfs4_layoutrecall {
+ struct kref clr_ref;
+ struct nfsd4_pnfs_cb_layout cb; /* request */
+ struct list_head clr_perclnt; /* on cl_layoutrecalls */
+ struct nfs4_client *clr_client;
+ struct nfs4_file *clr_file;
+ struct timespec clr_time; /* last activity */
+ struct super_block *clr_sb; /* We might not have a file */
+ struct nfs4_layoutrecall *parent; /* The initiating recall */
+
+ /* nfsd internal */
+ struct nfsd4_callback clr_recall;
+};
+
+struct nfsd4_pnfs_cb_dev_item {
+ u32 cbd_notify_type; /* request */
+ u32 cbd_layout_type; /* request */
+ struct nfsd4_pnfs_deviceid cbd_devid; /* request */
+ u32 cbd_immediate; /* request */
+};
+
+struct nfsd4_pnfs_cb_dev_list {
+ u32 cbd_len; /* request */
+ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */
+};
+
+/*
+ * callbacks provided by the nfsd
+ */
+struct pnfsd_cb_operations {
+ /* Generic callbacks */
+ int (*cb_layout_recall) (struct super_block *, struct inode *,
+ struct nfsd4_pnfs_cb_layout *);
+ int (*cb_device_notify) (struct super_block *,
+ struct nfsd4_pnfs_cb_dev_list *);
+
+ /* pNFS Files layout specific callbacks */
+
+ /* Callback from fs on MDS only */
+ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *);
+ /* Callback from fs on DS only */
+ int (*cb_change_state) (struct pnfs_get_state *);
+};
+
+#endif /* _LINUX_NFSD_NFSD4_PNFS_H */
diff -up linux-2.6.38.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.38.noarch/include/linux/nfsd/syscall.h
--- linux-2.6.38.noarch/include/linux/nfsd/syscall.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/nfsd/syscall.h 2011-03-26 07:57:44.325820879 -0400
@@ -29,6 +29,7 @@
/*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */
#define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */
#define NFSCTL_GETFS 8 /* get an fh by path with max FH len */
+#define NFSCTL_FD2FH 9 /* get a fh from a fd */
/* SVC */
struct nfsctl_svc {
@@ -71,6 +72,11 @@ struct nfsctl_fsparm {
int gd_maxlen;
};
+/* FD2FH */
+struct nfsctl_fd2fh {
+ int fd;
+};
+
/*
* This is the argument union.
*/
@@ -82,6 +88,7 @@ struct nfsctl_arg {
struct nfsctl_export u_export;
struct nfsctl_fdparm u_getfd;
struct nfsctl_fsparm u_getfs;
+ struct nfsctl_fd2fh u_fd2fh;
/*
* The following dummy member is needed to preserve binary compatibility
* on platforms where alignof(void*)>alignof(int). It's needed because
@@ -95,6 +102,7 @@ struct nfsctl_arg {
#define ca_export u.u_export
#define ca_getfd u.u_getfd
#define ca_getfs u.u_getfs
+#define ca_fd2fh u.u_fd2fh
};
union nfsctl_res {
diff -up linux-2.6.38.noarch/include/linux/nfs_fs.h.orig linux-2.6.38.noarch/include/linux/nfs_fs.h
--- linux-2.6.38.noarch/include/linux/nfs_fs.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/nfs_fs.h 2011-03-26 07:57:44.315820964 -0400
@@ -488,7 +488,7 @@ extern void nfs_release_automount_timer(
/*
* linux/fs/nfs/unlink.c
*/
-extern void nfs_complete_unlink(struct dentry *dentry, struct inode *);
+extern void nfs_complete_unlink(struct dentry *dentry, struct dentry *);
extern void nfs_block_sillyrename(struct dentry *dentry);
extern void nfs_unblock_sillyrename(struct dentry *dentry);
extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
@@ -499,8 +499,12 @@ extern int nfs_sillyrename(struct inode
extern int nfs_congestion_kb;
extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
extern int nfs_writepages(struct address_space *, struct writeback_control *);
-extern int nfs_flush_incompatible(struct file *file, struct page *page);
-extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
+struct pnfs_layout_segment;
+extern int nfs_flush_incompatible(struct file *file, struct page *page,
+ struct pnfs_layout_segment *lseg);
+extern int nfs_updatepage(struct file *, struct page *,
+ unsigned int offset, unsigned int count,
+ struct pnfs_layout_segment *lseg, void *fsdata);
extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
/*
diff -up linux-2.6.38.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.38.noarch/include/linux/nfs_fs_sb.h
--- linux-2.6.38.noarch/include/linux/nfs_fs_sb.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/nfs_fs_sb.h 2011-03-26 07:57:44.316820955 -0400
@@ -30,6 +30,8 @@ struct nfs_client {
#define NFS_CS_CALLBACK 1 /* - callback started */
#define NFS_CS_IDMAP 2 /* - idmap started */
#define NFS_CS_RENEWD 3 /* - renewd started */
+#define NFS_CS_STOP_RENEW 4 /* no more state to renew */
+#define NFS_CS_CHECK_LEASE_TIME 5 /* need to check lease time */
struct sockaddr_storage cl_addr; /* server identifier */
size_t cl_addrlen;
char * cl_hostname; /* hostname of server */
@@ -83,6 +85,16 @@ struct nfs_client {
#endif
};
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+#ifdef CONFIG_NFS_V4_1
+ return is_ds_only_session(clp->cl_exchange_flags);
+#else
+ return false;
+#endif
+}
+
/*
* NFS client parameters stored in the superblock.
*/
@@ -128,7 +140,7 @@ struct nfs_server {
#endif
#ifdef CONFIG_NFS_V4
- u32 attr_bitmask[2];/* V4 bitmask representing the set
+ u32 attr_bitmask[3];/* V4 bitmask representing the set
of attributes supported on this
filesystem */
u32 cache_consistency_bitmask[2];
@@ -140,7 +152,11 @@ struct nfs_server {
that are supported on this
filesystem */
struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */
- struct rpc_wait_queue roc_rpcwaitq;
+ struct rpc_wait_queue roc_rpcwaitq;
+ void *pnfs_ld_data; /* Per-mount data */
+ unsigned int ds_rsize; /* Data server read size */
+ unsigned int ds_wsize; /* Data server write size */
+ u32 pnfs_blksize; /* layout_blksize attr */
/* the following fields are protected by nfs_client->cl_lock */
struct rb_root state_owners;
diff -up linux-2.6.38.noarch/include/linux/nfs_iostat.h.orig linux-2.6.38.noarch/include/linux/nfs_iostat.h
--- linux-2.6.38.noarch/include/linux/nfs_iostat.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/nfs_iostat.h 2011-03-26 07:57:44.318820937 -0400
@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters {
NFSIOS_SHORTREAD,
NFSIOS_SHORTWRITE,
NFSIOS_DELAY,
+ NFSIOS_PNFS_READ,
+ NFSIOS_PNFS_WRITE,
+ NFSIOS_PNFS_COMMIT,
__NFSIOS_COUNTSMAX,
};
diff -up linux-2.6.38.noarch/include/linux/nfs_page.h.orig linux-2.6.38.noarch/include/linux/nfs_page.h
--- linux-2.6.38.noarch/include/linux/nfs_page.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/nfs_page.h 2011-03-26 07:57:44.319820928 -0400
@@ -49,6 +49,7 @@ struct nfs_page {
struct kref wb_kref; /* reference count */
unsigned long wb_flags;
struct nfs_writeverf wb_verf; /* Commit cookie */
+ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */
};
struct nfs_pageio_descriptor {
@@ -62,6 +63,11 @@ struct nfs_pageio_descriptor {
int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int);
int pg_ioflags;
int pg_error;
+ struct pnfs_layout_segment *pg_lseg;
+#ifdef CONFIG_NFS_V4_1
+ int pg_iswrite;
+ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+#endif /* CONFIG_NFS_V4_1 */
};
#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags))
@@ -70,13 +76,15 @@ extern struct nfs_page *nfs_create_reque
struct inode *inode,
struct page *page,
unsigned int offset,
- unsigned int count);
+ unsigned int count,
+ struct pnfs_layout_segment *lseg);
extern void nfs_clear_request(struct nfs_page *req);
extern void nfs_release_request(struct nfs_page *req);
extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst,
- pgoff_t idx_start, unsigned int npages, int tag);
+ pgoff_t idx_start, unsigned int npages, int tag,
+ int *use_pnfs);
extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
struct inode *inode,
int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
diff -up linux-2.6.38.noarch/include/linux/nfs_xdr.h.orig linux-2.6.38.noarch/include/linux/nfs_xdr.h
--- linux-2.6.38.noarch/include/linux/nfs_xdr.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/nfs_xdr.h 2011-03-26 07:57:44.321820912 -0400
@@ -3,6 +3,9 @@
#include <linux/nfsacl.h>
#include <linux/nfs3.h>
+#include <linux/kref.h>
+#include <linux/nfs4.h>
+#include <linux/sunrpc/sched.h>
/*
* To change the maximum rsize and wsize supported by the NFS client, adjust
@@ -10,7 +13,7 @@
* support a megabyte or more. The default is left at 4096 bytes, which is
* reasonable for NFS over UDP.
*/
-#define NFS_MAX_FILE_IO_SIZE (1048576U)
+#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U)
#define NFS_DEF_FILE_IO_SIZE (4096U)
#define NFS_MIN_FILE_IO_SIZE (1024U)
@@ -115,6 +118,7 @@ struct nfs_fsinfo {
struct timespec time_delta; /* server time granularity */
__u32 lease_time; /* in seconds */
__u32 layouttype; /* supported pnfs layout driver */
+ __u32 blksize; /* preferred pnfs io block size */
};
struct nfs_fsstat {
@@ -226,6 +230,73 @@ struct nfs4_layoutget {
struct pnfs_layout_segment **lsegpp;
};
+struct nfs4_layoutcommit_args {
+ nfs4_stateid stateid;
+ __u64 lastbytewritten;
+ __u32 time_modify_changed;
+ struct timespec time_modify;
+ const u32 *bitmask;
+ struct nfs_fh *fh;
+ struct inode *inode;
+
+ /* Values set by layout driver */
+ struct pnfs_layout_range range;
+ __u32 layout_type;
+ void *layoutdriver_data;
+ struct nfs4_sequence_args seq_args;
+};
+
+struct nfs4_layoutcommit_res {
+ __u32 sizechanged;
+ __u64 newsize;
+ struct nfs_fattr *fattr;
+ const struct nfs_server *server;
+ struct nfs4_sequence_res seq_res;
+ int status;
+};
+
+struct nfs4_layoutcommit_data {
+ struct rpc_task task;
+ struct rpc_cred *cred;
+ struct nfs_fattr fattr;
+ struct nfs4_layoutcommit_args args;
+ struct nfs4_layoutcommit_res res;
+};
+
+struct nfs4_layoutreturn_args {
+ __u32 reclaim;
+ __u32 layout_type;
+ __u32 return_type;
+ struct pnfs_layout_range range;
+ struct inode *inode;
+ struct nfs4_sequence_args seq_args;
+};
+
+struct nfs4_layoutreturn_res {
+ struct nfs4_sequence_res seq_res;
+ u32 lrs_present;
+ nfs4_stateid stateid;
+};
+
+struct nfs4_layoutreturn {
+ struct nfs4_layoutreturn_args args;
+ struct nfs4_layoutreturn_res res;
+ struct rpc_cred *cred;
+ struct nfs_client *clp;
+ int rpc_status;
+};
+
+struct nfs4_getdevicelist_args {
+ const struct nfs_fh *fh;
+ u32 layoutclass;
+ struct nfs4_sequence_args seq_args;
+};
+
+struct nfs4_getdevicelist_res {
+ struct pnfs_devicelist *devlist;
+ struct nfs4_sequence_res seq_res;
+};
+
struct nfs4_getdeviceinfo_args {
struct pnfs_device *pdev;
struct nfs4_sequence_args seq_args;
@@ -889,7 +960,7 @@ struct nfs4_server_caps_arg {
};
struct nfs4_server_caps_res {
- u32 attr_bitmask[2];
+ u32 attr_bitmask[3];
u32 acl_bitmask;
u32 has_links;
u32 has_symlinks;
@@ -1004,6 +1075,30 @@ struct nfs_page;
#define NFS_PAGEVEC_SIZE (8U)
+#if defined(CONFIG_NFS_V4_1)
+
+/* pnfsflag values */
+enum pnfs_flags {
+ PNFS_NO_RPC = 1 << 0, /* non rpc result callback switch */
+};
+
+/* pnfs-specific data needed for read, write, and commit calls */
+struct pnfs_call_data {
+ struct pnfs_layout_segment *lseg;
+ const struct rpc_call_ops *call_ops;
+ u32 orig_count; /* for retry via MDS */
+ int pnfs_error;
+ u8 pnfsflags;
+ u8 how; /* for FLUSH_STABLE */
+};
+
+/* files layout-type specific data for read, write, and commit */
+struct pnfs_fl_call_data {
+ struct nfs_client *ds_nfs_client;
+ __u64 orig_offset;
+};
+#endif /* CONFIG_NFS_V4_1 */
+
struct nfs_read_data {
int flags;
struct rpc_task task;
@@ -1019,10 +1114,16 @@ struct nfs_read_data {
#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
#endif
+#if defined(CONFIG_NFS_V4_1)
+ struct pnfs_call_data pdata;
+ struct pnfs_fl_call_data fldata;
+#endif /* CONFIG_NFS_V4_1 */
struct page *page_array[NFS_PAGEVEC_SIZE];
};
struct nfs_write_data {
+ struct kref refcount; /* For pnfs commit splitting */
+ struct nfs_write_data *parent; /* For pnfs commit splitting */
int flags;
struct rpc_task task;
struct inode *inode;
@@ -1038,6 +1139,10 @@ struct nfs_write_data {
#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
#endif
+#if defined(CONFIG_NFS_V4_1)
+ struct pnfs_call_data pdata;
+ struct pnfs_fl_call_data fldata;
+#endif /* CONFIG_NFS_V4_1 */
struct page *page_array[NFS_PAGEVEC_SIZE];
};
diff -up linux-2.6.38.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.38.noarch/include/linux/panfs_shim_api.h
--- linux-2.6.38.noarch/include/linux/panfs_shim_api.h.orig 2011-03-26 07:57:44.329820843 -0400
+++ linux-2.6.38.noarch/include/linux/panfs_shim_api.h 2011-03-26 07:57:44.329820843 -0400
@@ -0,0 +1,57 @@
+#ifndef _PANFS_SHIM_API_H
+#define _PANFS_SHIM_API_H
+
+/*
+ * imported panfs functions
+ */
+struct panfs_export_operations {
+ int (*convert_rc)(pan_status_t rc);
+
+ int (*sm_sec_t_get_size_otw)(
+ pan_sm_sec_otw_t *var,
+ pan_size_t *core_sizep,
+ pan_size_t *wire_size,
+ void *buf_end);
+
+ int (*sm_sec_t_unmarshall)(
+ pan_sm_sec_otw_t *in,
+ pan_sm_sec_t *out,
+ void *buf,
+ pan_size_t size,
+ pan_size_t *otw_consumed,
+ pan_size_t *in_core_consumed);
+
+ int (*ucreds_get)(void **ucreds_pp);
+
+ void (*ucreds_put)(void *ucreds);
+
+ int (*sam_read)(
+ pan_sam_access_flags_t flags,
+ pan_sam_read_args_t *args_p,
+ pan_sam_obj_sec_t *obj_sec_p,
+ pan_sg_entry_t *data_p,
+ void *ucreds,
+ pan_sam_read_cb_t closure,
+ void *user_arg1,
+ void *user_arg2,
+ pan_sam_read_res_t *res_p);
+
+ int (*sam_write)(
+ pan_sam_access_flags_t flags,
+ pan_sam_write_args_t *args_p,
+ pan_sam_obj_sec_t *obj_sec_p,
+ pan_sg_entry_t *data_p,
+ void *ucreds,
+ pan_sam_write_cb_t closure,
+ void *user_arg1,
+ void *user_arg2,
+ pan_sam_write_res_t *res_p);
+};
+
+extern int
+panfs_shim_register(struct panfs_export_operations *ops);
+
+extern int
+panfs_shim_unregister(void);
+
+#endif /* _PANFS_SHIM_API_H */
diff -up linux-2.6.38.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.38.noarch/include/linux/pnfs_osd_xdr.h
--- linux-2.6.38.noarch/include/linux/pnfs_osd_xdr.h.orig 2011-03-26 07:57:44.331820825 -0400
+++ linux-2.6.38.noarch/include/linux/pnfs_osd_xdr.h 2011-03-26 07:57:44.331820825 -0400
@@ -0,0 +1,439 @@
+/*
+ * pnfs_osd_xdr.h
+ *
+ * pNFS-osd on-the-wire data structures
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __PNFS_OSD_XDR_H__
+#define __PNFS_OSD_XDR_H__
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/exp_xdr.h>
+#include <scsi/osd_protocol.h>
+
+#define PNFS_OSD_OSDNAME_MAXSIZE 256
+
+/*
+ * START OF "GENERIC" DECODE ROUTINES.
+ * These may look a little ugly since they are imported from a "generic"
+ * set of XDR encode/decode routines which are intended to be shared by
+ * all of our NFSv4 implementations (OpenBSD, MacOS X...).
+ *
+ * If the pain of reading these is too great, it should be a straightforward
+ * task to translate them into Linux-specific versions which are more
+ * consistent with the style used in NFSv2/v3...
+ */
+#define READ32(x) (x) = ntohl(*p++)
+#define READ64(x) do { \
+ (x) = (u64)ntohl(*p++) << 32; \
+ (x) |= ntohl(*p++); \
+} while (0)
+#define COPYMEM(x, nbytes) do { \
+ memcpy((x), p, nbytes); \
+ p += XDR_QUADLEN(nbytes); \
+} while (0)
+
+/*
+ * draft-ietf-nfsv4-minorversion-22
+ * draft-ietf-nfsv4-pnfs-obj-12
+ */
+
+/* Layout Structure */
+
+enum pnfs_osd_raid_algorithm4 {
+ PNFS_OSD_RAID_0 = 1,
+ PNFS_OSD_RAID_4 = 2,
+ PNFS_OSD_RAID_5 = 3,
+ PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
+};
+
+/* struct pnfs_osd_data_map4 {
+ * uint32_t odm_num_comps;
+ * length4 odm_stripe_unit;
+ * uint32_t odm_group_width;
+ * uint32_t odm_group_depth;
+ * uint32_t odm_mirror_cnt;
+ * pnfs_osd_raid_algorithm4 odm_raid_algorithm;
+ * };
+ */
+struct pnfs_osd_data_map {
+ u32 odm_num_comps;
+ u64 odm_stripe_unit;
+ u32 odm_group_width;
+ u32 odm_group_depth;
+ u32 odm_mirror_cnt;
+ u32 odm_raid_algorithm;
+};
+
+static inline int
+pnfs_osd_data_map_xdr_sz(void)
+{
+ return 1 + 2 + 1 + 1 + 1 + 1;
+}
+
+static inline size_t
+pnfs_osd_data_map_incore_sz(void)
+{
+ return sizeof(struct pnfs_osd_data_map);
+}
+
+/* struct pnfs_osd_objid4 {
+ * deviceid4 oid_device_id;
+ * uint64_t oid_partition_id;
+ * uint64_t oid_object_id;
+ * };
+ */
+struct pnfs_osd_objid {
+ struct nfs4_deviceid oid_device_id;
+ u64 oid_partition_id;
+ u64 oid_object_id;
+};
+
+/* For printout. I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */
+#define _DEVID_LO(oid_device_id) \
+ (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data)
+
+#define _DEVID_HI(oid_device_id) \
+ (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1)
+
+static inline int
+pnfs_osd_objid_xdr_sz(void)
+{
+ return (NFS4_DEVICEID4_SIZE / 4) + 2 + 2;
+}
+
+static inline size_t
+pnfs_osd_objid_incore_sz(void)
+{
+ return sizeof(struct pnfs_osd_objid);
+}
+
+enum pnfs_osd_version {
+ PNFS_OSD_MISSING = 0,
+ PNFS_OSD_VERSION_1 = 1,
+ PNFS_OSD_VERSION_2 = 2
+};
+
+struct pnfs_osd_opaque_cred {
+ u32 cred_len;
+ u8 *cred;
+};
+
+static inline int
+pnfs_osd_opaque_cred_xdr_sz(u32 *p)
+{
+ u32 *start = p;
+ u32 n;
+
+ READ32(n);
+ p += XDR_QUADLEN(n);
+ return p - start;
+}
+
+static inline size_t
+pnfs_osd_opaque_cred_incore_sz(u32 *p)
+{
+ u32 n;
+
+ READ32(n);
+ return XDR_QUADLEN(n) * 4;
+}
+
+enum pnfs_osd_cap_key_sec {
+ PNFS_OSD_CAP_KEY_SEC_NONE = 0,
+ PNFS_OSD_CAP_KEY_SEC_SSV = 1,
+};
+
+/* struct pnfs_osd_object_cred4 {
+ * pnfs_osd_objid4 oc_object_id;
+ * pnfs_osd_version4 oc_osd_version;
+ * pnfs_osd_cap_key_sec4 oc_cap_key_sec;
+ * opaque oc_capability_key<>;
+ * opaque oc_capability<>;
+ * };
+ */
+struct pnfs_osd_object_cred {
+ struct pnfs_osd_objid oc_object_id;
+ u32 oc_osd_version;
+ u32 oc_cap_key_sec;
+ struct pnfs_osd_opaque_cred oc_cap_key;
+ struct pnfs_osd_opaque_cred oc_cap;
+};
+
+static inline int
+pnfs_osd_object_cred_xdr_sz(u32 *p)
+{
+ u32 *start = p;
+
+ p += pnfs_osd_objid_xdr_sz() + 2;
+ p += pnfs_osd_opaque_cred_xdr_sz(p);
+ p += pnfs_osd_opaque_cred_xdr_sz(p);
+ return p - start;
+}
+
+static inline size_t
+pnfs_osd_object_cred_incore_sz(u32 *p)
+{
+ size_t sz = sizeof(struct pnfs_osd_object_cred);
+
+ p += pnfs_osd_objid_xdr_sz() + 2;
+ sz += pnfs_osd_opaque_cred_incore_sz(p);
+ p += pnfs_osd_opaque_cred_xdr_sz(p);
+ sz += pnfs_osd_opaque_cred_incore_sz(p);
+ return sz;
+}
+
+/* struct pnfs_osd_layout4 {
+ * pnfs_osd_data_map4 olo_map;
+ * uint32_t olo_comps_index;
+ * pnfs_osd_object_cred4 olo_components<>;
+ * };
+ */
+struct pnfs_osd_layout {
+ struct pnfs_osd_data_map olo_map;
+ u32 olo_comps_index;
+ u32 olo_num_comps;
+ struct pnfs_osd_object_cred *olo_comps;
+};
+
+static inline int
+pnfs_osd_layout_xdr_sz(u32 *p)
+{
+ u32 *start = p;
+ u32 n;
+
+ p += pnfs_osd_data_map_xdr_sz() + 1;
+ READ32(n);
+ while ((int)(n--) > 0)
+ p += pnfs_osd_object_cred_xdr_sz(p);
+ return p - start;
+}
+
+static inline size_t
+pnfs_osd_layout_incore_sz(u32 *p)
+{
+ u32 n;
+ size_t sz;
+
+ p += pnfs_osd_data_map_xdr_sz() + 1;
+ READ32(n);
+ sz = sizeof(struct pnfs_osd_layout);
+ while ((int)(n--) > 0) {
+ sz += pnfs_osd_object_cred_incore_sz(p);
+ p += pnfs_osd_object_cred_xdr_sz(p);
+ }
+ return sz;
+}
+
+/* Device Address */
+
+enum pnfs_osd_targetid_type {
+ OBJ_TARGET_ANON = 1,
+ OBJ_TARGET_SCSI_NAME = 2,
+ OBJ_TARGET_SCSI_DEVICE_ID = 3,
+};
+
+/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) {
+ * case OBJ_TARGET_SCSI_NAME:
+ * string oti_scsi_name<>;
+ *
+ * case OBJ_TARGET_SCSI_DEVICE_ID:
+ * opaque oti_scsi_device_id<>;
+ *
+ * default:
+ * void;
+ * };
+ *
+ * union pnfs_osd_targetaddr4 switch (bool ota_available) {
+ * case TRUE:
+ * netaddr4 ota_netaddr;
+ * case FALSE:
+ * void;
+ * };
+ *
+ * struct pnfs_osd_deviceaddr4 {
+ * pnfs_osd_targetid4 oda_targetid;
+ * pnfs_osd_targetaddr4 oda_targetaddr;
+ * uint64_t oda_lun;
+ * opaque oda_systemid<>;
+ * pnfs_osd_object_cred4 oda_root_obj_cred;
+ * opaque oda_osdname<>;
+ * };
+ */
+struct pnfs_osd_targetid {
+ u32 oti_type;
+ struct nfs4_string oti_scsi_device_id;
+};
+
+enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 };
+
+/* struct netaddr4 {
+ * // see struct rpcb in RFC1833
+ * string r_netid<>; // network id
+ * string r_addr<>; // universal address
+ * };
+ */
+struct pnfs_osd_net_addr {
+ struct nfs4_string r_netid;
+ struct nfs4_string r_addr;
+};
+
+struct pnfs_osd_targetaddr {
+ u32 ota_available;
+ struct pnfs_osd_net_addr ota_netaddr;
+};
+
+enum {
+ NETWORK_ID_MAX = 16 / 4,
+ UNIVERSAL_ADDRESS_MAX = 64 / 4,
+ PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX,
+};
+
+struct pnfs_osd_deviceaddr {
+ struct pnfs_osd_targetid oda_targetid;
+ struct pnfs_osd_targetaddr oda_targetaddr;
+ u8 oda_lun[8];
+ struct nfs4_string oda_systemid;
+ struct pnfs_osd_object_cred oda_root_obj_cred;
+ struct nfs4_string oda_osdname;
+};
+
+enum {
+ ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4,
+ PNFS_OSD_DEVICEADDR_MAX =
+ PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX +
+ 2 /*oda_lun*/ +
+ 1 + OSD_SYSTEMID_LEN +
+ 1 + ODA_OSDNAME_MAX,
+};
+
+/* LAYOUTCOMMIT: layoutupdate */
+
+/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) {
+ * case TRUE:
+ * int64_t dsu_delta;
+ * case FALSE:
+ * void;
+ * };
+ *
+ * struct pnfs_osd_layoutupdate4 {
+ * pnfs_osd_deltaspaceused4 olu_delta_space_used;
+ * bool olu_ioerr_flag;
+ * };
+ */
+struct pnfs_osd_layoutupdate {
+ u32 dsu_valid;
+ s64 dsu_delta;
+ u32 olu_ioerr_flag;
+};
+
+/* LAYOUTRETURN: I/O Error Report */
+
+enum pnfs_osd_errno {
+ PNFS_OSD_ERR_EIO = 1,
+ PNFS_OSD_ERR_NOT_FOUND = 2,
+ PNFS_OSD_ERR_NO_SPACE = 3,
+ PNFS_OSD_ERR_BAD_CRED = 4,
+ PNFS_OSD_ERR_NO_ACCESS = 5,
+ PNFS_OSD_ERR_UNREACHABLE = 6,
+ PNFS_OSD_ERR_RESOURCE = 7
+};
+
+/* struct pnfs_osd_ioerr4 {
+ * pnfs_osd_objid4 oer_component;
+ * length4 oer_comp_offset;
+ * length4 oer_comp_length;
+ * bool oer_iswrite;
+ * pnfs_osd_errno4 oer_errno;
+ * };
+ */
+struct pnfs_osd_ioerr {
+ struct pnfs_osd_objid oer_component;
+ u64 oer_comp_offset;
+ u64 oer_comp_length;
+ u32 oer_iswrite;
+ u32 oer_errno;
+};
+
+static inline unsigned
+pnfs_osd_ioerr_xdr_sz(void)
+{
+ return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1;
+}
+
+/* OSD XDR API */
+
+/* Layout helpers */
+extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout(
+ struct pnfs_osd_layout *layout, u32 *p);
+
+extern int pnfs_osd_xdr_encode_layout(
+ struct exp_xdr_stream *xdr,
+ struct pnfs_osd_layout *layout);
+
+/* Device Info helpers */
+
+/* First pass calculate total size for space needed */
+extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p);
+
+/* Note: some strings pointed to inside @deviceaddr might point
+ * to space inside @p. @p should stay valid while @deviceaddr
+ * is in use.
+ * It is assumed that @deviceaddr points to bigger memory of size
+ * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz()
+ */
+extern void pnfs_osd_xdr_decode_deviceaddr(
+ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p);
+
+/* For Servers */
+extern int pnfs_osd_xdr_encode_deviceaddr(
+ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr);
+
+/* layoutupdate (layout_commit) xdr helpers */
+extern int
+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
+ struct pnfs_osd_layoutupdate *lou);
+extern __be32 *
+pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p);
+
+/* osd_ioerror encoding/decoding (layout_return) */
+extern int
+pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr);
+extern __be32 *
+pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p);
+
+#endif /* __PNFS_OSD_XDR_H__ */
diff -up linux-2.6.38.noarch/include/linux/posix_acl.h.orig linux-2.6.38.noarch/include/linux/posix_acl.h
--- linux-2.6.38.noarch/include/linux/posix_acl.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/posix_acl.h 2011-03-26 07:57:44.332820817 -0400
@@ -8,6 +8,7 @@
#ifndef __LINUX_POSIX_ACL_H
#define __LINUX_POSIX_ACL_H
+#include <linux/fs.h>
#include <linux/slab.h>
#define ACL_UNDEFINED_ID (-1)
diff -up linux-2.6.38.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.38.noarch/include/linux/sunrpc/msg_prot.h
--- linux-2.6.38.noarch/include/linux/sunrpc/msg_prot.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/sunrpc/msg_prot.h 2011-03-26 07:57:44.332820817 -0400
@@ -14,6 +14,8 @@
/* size of an XDR encoding unit in bytes, i.e. 32bit */
#define XDR_UNIT (4)
+#include <linux/types.h>
+
/* spec defines authentication flavor as an unsigned 32 bit integer */
typedef u32 rpc_authflavor_t;
diff -up linux-2.6.38.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.38.noarch/include/linux/sunrpc/rpc_pipe_fs.h
--- linux-2.6.38.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2011-03-26 07:57:44.333820809 -0400
@@ -3,6 +3,7 @@
#ifdef __KERNEL__
+#include <linux/fs.h>
#include <linux/workqueue.h>
struct rpc_pipe_msg {
@@ -11,6 +12,10 @@ struct rpc_pipe_msg {
size_t len;
size_t copied;
int errno;
+#define PIPEFS_AUTOFREE_RPCMSG 0x01 /* frees rpc_pipe_msg */
+#define PIPEFS_AUTOFREE_RPCMSG_DATA 0x02 /* frees rpc_pipe_msg->data */
+#define PIPEFS_AUTOFREE_UPCALL_MSG PIPEFS_AUTOFREE_RPCMSG_DATA
+ u8 flags;
};
struct rpc_pipe_ops {
diff -up linux-2.6.38.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.38.noarch/include/linux/sunrpc/simple_rpc_pipefs.h
--- linux-2.6.38.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2011-03-26 07:57:44.334820801 -0400
+++ linux-2.6.38.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2011-03-26 07:57:44.334820801 -0400
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2008 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * David M. Richter <richterd@citi.umich.edu>
+ *
+ * Drawing on work done by Andy Adamson <andros@citi.umich.edu> and
+ * Marius Eriksen <marius@monkey.org>. Thanks for the help over the
+ * years, guys.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * With thanks to CITI's project sponsor and partner, IBM.
+ */
+
+#ifndef _SIMPLE_RPC_PIPEFS_H_
+#define _SIMPLE_RPC_PIPEFS_H_
+
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#define payload_of(headerp) ((void *)(headerp + 1))
+
+/*
+ * struct pipefs_hdr -- the generic message format for simple_rpc_pipefs.
+ * Messages may simply be the header itself, although having an optional
+ * data payload follow the header allows much more flexibility.
+ *
+ * Messages are created using pipefs_alloc_init_msg() and
+ * pipefs_alloc_init_msg_padded(), both of which accept a pointer to an
+ * (optional) data payload.
+ *
+ * Given a struct pipefs_hdr *msg that has a struct foo payload, the data
+ * can be accessed using: struct foo *foop = payload_of(msg)
+ */
+struct pipefs_hdr {
+ u32 msgid;
+ u8 type;
+ u8 flags;
+ u16 totallen; /* length of entire message, including hdr itself */
+ u32 status;
+};
+
+/*
+ * struct pipefs_list -- a type of list used for tracking callers who've made an
+ * upcall and are blocked waiting for a reply.
+ *
+ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply().
+ */
+struct pipefs_list {
+ struct list_head list;
+ spinlock_t list_lock;
+};
+
+
+/* See net/sunrpc/simple_rpc_pipefs.c for more info on using these functions. */
+extern struct dentry *pipefs_mkpipe(const char *name,
+ const struct rpc_pipe_ops *ops,
+ int wait_for_open);
+extern void pipefs_closepipe(struct dentry *pipe);
+extern void pipefs_init_list(struct pipefs_list *list);
+extern struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags,
+ void *data, u16 datalen);
+extern struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type,
+ u8 flags, void *data,
+ u16 datalen, u16 padlen);
+extern struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe,
+ struct pipefs_hdr *msg,
+ struct pipefs_list
+ *uplist, u8 upflags,
+ u32 timeout);
+extern int pipefs_queue_upcall_noreply(struct dentry *pipe,
+ struct pipefs_hdr *msg, u8 upflags);
+extern int pipefs_assign_upcall_reply(struct pipefs_hdr *reply,
+ struct pipefs_list *uplist);
+extern struct pipefs_hdr *pipefs_readmsg(struct file *filp,
+ const char __user *src, size_t len);
+extern ssize_t pipefs_generic_upcall(struct file *filp,
+ struct rpc_pipe_msg *rpcmsg,
+ char __user *dst, size_t buflen);
+extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg);
+
+#endif /* _SIMPLE_RPC_PIPEFS_H_ */
diff -up linux-2.6.38.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.38.noarch/include/linux/sunrpc/svc_xprt.h
--- linux-2.6.38.noarch/include/linux/sunrpc/svc_xprt.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/sunrpc/svc_xprt.h 2011-03-26 07:57:44.334820801 -0400
@@ -204,4 +204,41 @@ static inline char *__svc_print_addr(con
return buf;
}
+
+/*
+ * Print a network address in a universal format (see rfc1833 and nfsv4.1)
+ */
+static inline int __svc_print_netaddr(struct sockaddr *addr,
+ struct xdr_netobj *na)
+{
+ u16 port;
+ ssize_t len;
+
+ switch (addr->sa_family) {
+ case AF_INET: {
+ struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+ port = ntohs(sin->sin_port);
+
+ len = snprintf(na->data, na->len, "%pI4.%u.%u",
+ &sin->sin_addr,
+ port >> 8, port & 0xff);
+ break;
+ }
+ case AF_INET6: {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+ port = ntohs(sin6->sin6_port);
+
+ len = snprintf(na->data, na->len, "%pI6.%u.%u",
+ &sin6->sin6_addr,
+ port >> 8, port & 0xff);
+ break;
+ }
+ default:
+ snprintf(na->data, na->len, "unknown address type: %d",
+ addr->sa_family);
+ len = -EINVAL;
+ break;
+ }
+ return len;
+}
#endif /* SUNRPC_SVC_XPRT_H */
diff -up linux-2.6.38.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.38.noarch/include/linux/sunrpc/xdr.h
--- linux-2.6.38.noarch/include/linux/sunrpc/xdr.h.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/include/linux/sunrpc/xdr.h 2011-03-26 07:57:44.335820793 -0400
@@ -213,6 +213,7 @@ typedef int (*kxdrdproc_t)(void *rqstp,
extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
+extern __be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q);
extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
unsigned int base, unsigned int len);
extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
diff -up linux-2.6.38.noarch/net/sunrpc/Makefile.orig linux-2.6.38.noarch/net/sunrpc/Makefile
--- linux-2.6.38.noarch/net/sunrpc/Makefile.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/net/sunrpc/Makefile 2011-03-26 07:57:44.336820785 -0400
@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt
svc.o svcsock.o svcauth.o svcauth_unix.o \
addr.o rpcb_clnt.o timer.o xdr.o \
sunrpc_syms.o cache.o rpc_pipe.o \
- svc_xprt.o
+ svc_xprt.o simple_rpc_pipefs.o
sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o
sunrpc-$(CONFIG_PROC_FS) += stats.o
sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff -up linux-2.6.38.noarch/net/sunrpc/sched.c.orig linux-2.6.38.noarch/net/sunrpc/sched.c
--- linux-2.6.38.noarch/net/sunrpc/sched.c.orig 2011-03-26 07:53:04.357196210 -0400
+++ linux-2.6.38.noarch/net/sunrpc/sched.c 2011-03-26 07:57:44.337820776 -0400
@@ -787,11 +787,11 @@ EXPORT_SYMBOL_GPL(rpc_free);
/*
* Creation and deletion of RPC task structures
*/
-static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data)
+static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data, unsigned short extra_flags)
{
memset(task, 0, sizeof(*task));
atomic_set(&task->tk_count, 1);
- task->tk_flags = task_setup_data->flags;
+ task->tk_flags = task_setup_data->flags | extra_flags;
task->tk_ops = task_setup_data->callback_ops;
task->tk_calldata = task_setup_data->callback_data;
INIT_LIST_HEAD(&task->tk_task);
@@ -840,14 +840,14 @@ struct rpc_task *rpc_new_task(const stru
flags = RPC_TASK_DYNAMIC;
}
- rpc_init_task(task, setup_data);
+ rpc_init_task(task, setup_data, flags);
+
if (task->tk_status < 0) {
int err = task->tk_status;
rpc_put_task(task);
return ERR_PTR(err);
}
- task->tk_flags |= flags;
dprintk("RPC: allocated task %p\n", task);
return task;
}
diff -up linux-2.6.38.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.38.noarch/net/sunrpc/simple_rpc_pipefs.c
--- linux-2.6.38.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2011-03-26 07:57:44.338820767 -0400
+++ linux-2.6.38.noarch/net/sunrpc/simple_rpc_pipefs.c 2011-03-26 07:57:44.338820767 -0400
@@ -0,0 +1,423 @@
+/*
+ * net/sunrpc/simple_rpc_pipefs.c
+ *
+ * Copyright (c) 2008 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * David M. Richter <richterd@citi.umich.edu>
+ *
+ * Drawing on work done by Andy Adamson <andros@citi.umich.edu> and
+ * Marius Eriksen <marius@monkey.org>. Thanks for the help over the
+ * years, guys.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * With thanks to CITI's project sponsor and partner, IBM.
+ */
+
+#include <linux/mount.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/simple_rpc_pipefs.h>
+
+
+/*
+ * Make an rpc_pipefs pipe named @name at the root of the mounted rpc_pipefs
+ * filesystem.
+ *
+ * If @wait_for_open is non-zero and an upcall is later queued but the userland
+ * end of the pipe has not yet been opened, the upcall will remain queued until
+ * the pipe is opened; otherwise, the upcall queueing will return with -EPIPE.
+ */
+struct dentry *pipefs_mkpipe(const char *name, const struct rpc_pipe_ops *ops,
+ int wait_for_open)
+{
+ struct dentry *dir, *pipe;
+ struct vfsmount *mnt;
+
+ mnt = rpc_get_mount();
+ if (IS_ERR(mnt)) {
+ pipe = ERR_CAST(mnt);
+ goto out;
+ }
+ dir = mnt->mnt_root;
+ if (!dir) {
+ pipe = ERR_PTR(-ENOENT);
+ goto out;
+ }
+ pipe = rpc_mkpipe(dir, name, NULL, ops,
+ wait_for_open ? RPC_PIPE_WAIT_FOR_OPEN : 0);
+out:
+ return pipe;
+}
+EXPORT_SYMBOL(pipefs_mkpipe);
+
+/*
+ * Shutdown a pipe made by pipefs_mkpipe().
+ * XXX: do we need to retain an extra reference on the mount?
+ */
+void pipefs_closepipe(struct dentry *pipe)
+{
+ rpc_unlink(pipe);
+ rpc_put_mount();
+}
+EXPORT_SYMBOL(pipefs_closepipe);
+
+/*
+ * Initialize a struct pipefs_list -- which are a way to keep track of callers
+ * who're blocked having made an upcall and are awaiting a reply.
+ *
+ * See pipefs_queue_upcall_waitreply() and pipefs_find_upcall_msgid() for how
+ * to use them.
+ */
+inline void pipefs_init_list(struct pipefs_list *list)
+{
+ INIT_LIST_HEAD(&list->list);
+ spin_lock_init(&list->list_lock);
+}
+EXPORT_SYMBOL(pipefs_init_list);
+
+/*
+ * Alloc/init a generic pipefs message header and copy into its message body
+ * an arbitrary data payload.
+ *
+ * struct pipefs_hdr's are meant to serve as generic, general-purpose message
+ * headers for easy rpc_pipefs I/O. When an upcall is made, the
+ * struct pipefs_hdr is assigned to a struct rpc_pipe_msg and delivered
+ * therein. --And yes, the naming can seem a little confusing at first:
+ *
+ * When one thinks of an upcall "message", in simple_rpc_pipefs that's a
+ * struct pipefs_hdr (possibly with an attached message body). A
+ * struct rpc_pipe_msg is actually only the -vehicle- by which the "real"
+ * message is delivered and processed.
+ */
+struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, u8 flags,
+ void *data, u16 datalen, u16 padlen)
+{
+ u16 totallen;
+ struct pipefs_hdr *msg = NULL;
+
+ totallen = sizeof(*msg) + datalen + padlen;
+ if (totallen > PAGE_SIZE) {
+ msg = ERR_PTR(-E2BIG);
+ goto out;
+ }
+
+ msg = kzalloc(totallen, GFP_KERNEL);
+ if (!msg) {
+ msg = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ msg->msgid = msgid;
+ msg->type = type;
+ msg->flags = flags;
+ msg->totallen = totallen;
+ memcpy(payload_of(msg), data, datalen);
+out:
+ return msg;
+}
+EXPORT_SYMBOL(pipefs_alloc_init_msg_padded);
+
+/*
+ * See the description of pipefs_alloc_init_msg_padded().
+ */
+struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags,
+ void *data, u16 datalen)
+{
+ return pipefs_alloc_init_msg_padded(msgid, type, flags, data,
+ datalen, 0);
+}
+EXPORT_SYMBOL(pipefs_alloc_init_msg);
+
+
+static void pipefs_init_rpcmsg(struct rpc_pipe_msg *rpcmsg,
+ struct pipefs_hdr *msg, u8 upflags)
+{
+ memset(rpcmsg, 0, sizeof(*rpcmsg));
+ rpcmsg->data = msg;
+ rpcmsg->len = msg->totallen;
+ rpcmsg->flags = upflags;
+}
+
+static struct rpc_pipe_msg *pipefs_alloc_init_rpcmsg(struct pipefs_hdr *msg,
+ u8 upflags)
+{
+ struct rpc_pipe_msg *rpcmsg;
+
+ rpcmsg = kmalloc(sizeof(*rpcmsg), GFP_KERNEL);
+ if (!rpcmsg)
+ return ERR_PTR(-ENOMEM);
+
+ pipefs_init_rpcmsg(rpcmsg, msg, upflags);
+ return rpcmsg;
+}
+
+
+/* represents an upcall that'll block and wait for a reply */
+struct pipefs_upcall {
+ u32 msgid;
+ struct rpc_pipe_msg rpcmsg;
+ struct list_head list;
+ wait_queue_head_t waitq;
+ struct pipefs_hdr *reply;
+};
+
+
+static void pipefs_init_upcall_waitreply(struct pipefs_upcall *upcall,
+ struct pipefs_hdr *msg, u8 upflags)
+{
+ upcall->reply = NULL;
+ upcall->msgid = msg->msgid;
+ INIT_LIST_HEAD(&upcall->list);
+ init_waitqueue_head(&upcall->waitq);
+ pipefs_init_rpcmsg(&upcall->rpcmsg, msg, upflags);
+}
+
+static int __pipefs_queue_upcall_waitreply(struct dentry *pipe,
+ struct pipefs_upcall *upcall,
+ struct pipefs_list *uplist,
+ u32 timeout)
+{
+ int err = 0;
+ DECLARE_WAITQUEUE(wq, current);
+
+ add_wait_queue(&upcall->waitq, &wq);
+ spin_lock(&uplist->list_lock);
+ list_add(&upcall->list, &uplist->list);
+ spin_unlock(&uplist->list_lock);
+
+ err = rpc_queue_upcall(pipe->d_inode, &upcall->rpcmsg);
+ if (err < 0)
+ goto out;
+
+ if (timeout) {
+ /* retval of 0 means timer expired */
+ err = schedule_timeout_uninterruptible(timeout);
+ if (err == 0 && upcall->reply == NULL)
+ err = -ETIMEDOUT;
+ } else {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+
+out:
+ spin_lock(&uplist->list_lock);
+ list_del_init(&upcall->list);
+ spin_unlock(&uplist->list_lock);
+ remove_wait_queue(&upcall->waitq, &wq);
+ return err;
+}
+
+/*
+ * Queue a pipefs msg for an upcall to userspace, place the calling thread
+ * on @uplist, and block the thread to wait for a reply. If @timeout is
+ * nonzero, the thread will be blocked for at most @timeout jiffies.
+ *
+ * (To convert time units into jiffies, consider the functions
+ * msecs_to_jiffies(), usecs_to_jiffies(), timeval_to_jiffies(), and
+ * timespec_to_jiffies().)
+ *
+ * Once a reply is received by your downcall handler, call
+ * pipefs_assign_upcall_reply() with @uplist to find the corresponding upcall,
+ * assign the reply, and wake the waiting thread.
+ *
+ * This function's return value pointer may be an error and should be checked
+ * with IS_ERR() before attempting to access the reply message.
+ *
+ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg()
+ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG
+ * flag is set in @upflags. See also rpc_pipe_fs.h.
+ */
+struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe,
+						 struct pipefs_hdr *msg,
+						 struct pipefs_list *uplist,
+						 u8 upflags, u32 timeout)
+{
+	int err = 0;
+	/* Lives on this stack frame only for the duration of the wait. */
+	struct pipefs_upcall upcall;
+
+	pipefs_init_upcall_waitreply(&upcall, msg, upflags);
+	err = __pipefs_queue_upcall_waitreply(pipe, &upcall, uplist, timeout);
+	if (err < 0) {
+		/*
+		 * NOTE(review): on timeout the downcall side may still hold
+		 * a pointer to this stack-based upcall and assign ->reply
+		 * after we free it here -- looks like a use-after-free
+		 * window; confirm against pipefs_assign_upcall_reply()'s
+		 * locking before relying on the timeout path.
+		 */
+		kfree(upcall.reply);
+		upcall.reply = ERR_PTR(err);
+	}
+
+	return upcall.reply;
+}
+EXPORT_SYMBOL(pipefs_queue_upcall_waitreply);
+
+/*
+ * Queue a pipefs msg for an upcall to userspace and immediately return (i.e.,
+ * no reply is expected).
+ *
+ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg()
+ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG
+ * flag is set in @upflags. See also rpc_pipe_fs.h.
+ */
+int pipefs_queue_upcall_noreply(struct dentry *pipe, struct pipefs_hdr *msg,
+				u8 upflags)
+{
+	struct rpc_pipe_msg *rpcmsg;
+
+	/* The rpc_pipe_msg frees itself via ->destroy_msg() (AUTOFREE). */
+	rpcmsg = pipefs_alloc_init_rpcmsg(msg, upflags | PIPEFS_AUTOFREE_RPCMSG);
+	if (IS_ERR(rpcmsg))
+		return PTR_ERR(rpcmsg);
+
+	return rpc_queue_upcall(pipe->d_inode, rpcmsg);
+}
+EXPORT_SYMBOL(pipefs_queue_upcall_noreply);
+
+
+/* Return the queued upcall with matching @msgid, or NULL if none. */
+static struct pipefs_upcall *pipefs_find_upcall_msgid(u32 msgid,
+						      struct pipefs_list *uplist)
+{
+	struct pipefs_upcall *pos, *found = NULL;
+
+	spin_lock(&uplist->list_lock);
+	list_for_each_entry(pos, &uplist->list, list) {
+		if (pos->msgid == msgid) {
+			found = pos;
+			break;
+		}
+	}
+	spin_unlock(&uplist->list_lock);
+	return found;
+}
+
+/*
+ * In your rpc_pipe_ops->downcall() handler, once you've read in a downcall
+ * message and have determined that it is a reply to a waiting upcall,
+ * you can use this function to find the appropriate upcall, assign the result,
+ * and wake the upcall thread.
+ *
+ * The reply message must have the same msgid as the original upcall message's.
+ *
+ * See also pipefs_queue_upcall_waitreply() and pipefs_readmsg().
+ */
+/*
+ * Find the waiting upcall matching @reply->msgid on @uplist, hand it the
+ * reply, and wake its sleeping thread.
+ *
+ * Returns 0 on success or -ENOENT if no matching upcall is queued.
+ */
+int pipefs_assign_upcall_reply(struct pipefs_hdr *reply,
+			       struct pipefs_list *uplist)
+{
+	int err = -ENOENT;
+	struct pipefs_upcall *upcall;
+
+	/*
+	 * Assign the reply and wake the sleeper while still holding the
+	 * list lock: a timed-out waiter removes itself from @uplist under
+	 * this same lock before its stack-based upcall goes away, so doing
+	 * the assignment after dropping the lock (as the old code did via
+	 * pipefs_find_upcall_msgid()) could write to a dead stack frame.
+	 */
+	spin_lock(&uplist->list_lock);
+	list_for_each_entry(upcall, &uplist->list, list) {
+		if (upcall->msgid == reply->msgid) {
+			upcall->reply = reply;
+			wake_up(&upcall->waitq);
+			err = 0;
+			break;
+		}
+	}
+	spin_unlock(&uplist->list_lock);
+
+	if (err)
+		printk(KERN_ERR "%s: ERROR: have reply but no matching upcall "
+			"for msgid %d\n", __func__, reply->msgid);
+	return err;
+}
+EXPORT_SYMBOL(pipefs_assign_upcall_reply);
+
+/*
+ * Generic method to read-in and return a newly-allocated message which begins
+ * with a struct pipefs_hdr.
+ */
+struct pipefs_hdr *pipefs_readmsg(struct file *filp, const char __user *src,
+				  size_t len)
+{
+	int err = 0, hdrsize;
+	struct pipefs_hdr *msg = NULL;
+
+	/* Reject writes too small to contain even the fixed header. */
+	hdrsize = sizeof(*msg);
+	if (len < hdrsize) {
+		printk(KERN_ERR "%s: ERROR: header is too short (%d vs %d)\n",
+			__func__, (int) len, hdrsize);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * NOTE(review): @len comes from userspace and has no upper bound
+	 * here, so a large write drives an arbitrarily large allocation;
+	 * consider a sanity cap.  kzalloc (not kmalloc) so any trailing
+	 * padding the caller didn't fill reads as zero.
+	 */
+	msg = kzalloc(len, GFP_KERNEL);
+	if (!msg) {
+		err = -ENOMEM;
+		goto out;
+	}
+	if (copy_from_user(msg, src, len))
+		err = -EFAULT;
+out:
+	/* On any failure, free the partial message and return ERR_PTR. */
+	if (err) {
+		kfree(msg);
+		msg = ERR_PTR(err);
+	}
+	return msg;
+}
+EXPORT_SYMBOL(pipefs_readmsg);
+
+/*
+ * Generic rpc_pipe_ops->upcall() handler implementation.
+ *
+ * Don't call this directly: to make an upcall, use
+ * pipefs_queue_upcall_waitreply() or pipefs_queue_upcall_noreply().
+ */
+/*
+ * Copy the next chunk of @rpcmsg out to userspace at @dst (at most @buflen
+ * bytes), advancing ->copied.  Returns the number of bytes copied, or
+ * -EFAULT if the destination buffer was entirely inaccessible.
+ */
+ssize_t pipefs_generic_upcall(struct file *filp, struct rpc_pipe_msg *rpcmsg,
+			      char __user *dst, size_t buflen)
+{
+	char *data;
+	size_t len;
+	unsigned long left;
+
+	data = (char *)rpcmsg->data + rpcmsg->copied;
+	len = rpcmsg->len - rpcmsg->copied;
+	if (len > buflen)
+		len = buflen;
+
+	/*
+	 * copy_to_user() returns the (unsigned) number of bytes it could
+	 * NOT copy, never a negative errno, so the old "left < 0" test was
+	 * dead code and a total fault fell through returning 0/errno 0.
+	 * Report -EFAULT when nothing at all could be copied.
+	 */
+	left = copy_to_user(dst, data, len);
+	if (left == len && len != 0) {
+		rpcmsg->errno = -EFAULT;
+		return -EFAULT;
+	}
+
+	/* Partial copy is fine: account only for what actually made it. */
+	len -= left;
+	rpcmsg->copied += len;
+	rpcmsg->errno = 0;
+	return len;
+}
+EXPORT_SYMBOL(pipefs_generic_upcall);
+
+/*
+ * Generic rpc_pipe_ops->destroy_msg() handler implementation.
+ *
+ * Items are only freed if @rpcmsg->flags has been set appropriately.
+ * See pipefs_queue_upcall_noreply() and rpc_pipe_fs.h.
+ */
+void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg)
+{
+	unsigned long autofree = rpcmsg->flags;
+
+	/* Free the payload before (possibly) freeing its container. */
+	if (autofree & PIPEFS_AUTOFREE_UPCALL_MSG)
+		kfree(rpcmsg->data);
+	if (autofree & PIPEFS_AUTOFREE_RPCMSG)
+		kfree(rpcmsg);
+}
+EXPORT_SYMBOL(pipefs_generic_destroy_msg);
diff -up linux-2.6.38.noarch/net/sunrpc/xdr.c.orig linux-2.6.38.noarch/net/sunrpc/xdr.c
--- linux-2.6.38.noarch/net/sunrpc/xdr.c.orig 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38.noarch/net/sunrpc/xdr.c 2011-03-26 07:57:44.338820767 -0400
@@ -518,6 +518,27 @@ __be32 * xdr_reserve_space(struct xdr_st
EXPORT_SYMBOL_GPL(xdr_reserve_space);
/**
+ * xdr_rewind_stream - rewind a stream back to some checkpoint
+ * @xdr: pointer to xdr_stream
+ * @q: some checkpoint at historical place of @xdr
+ *
+ * Restores an xdr stream to some historical point. @q must be
+ * a logical xdr point in the past that was sampled by @q = @xdr->p.
+ */
+__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q)
+{
+	size_t nbytes = (xdr->p - q) << 2;	/* XDR words -> bytes; bogus if p < q, checked below */
+
+	BUG_ON(xdr->p < q);	/* @q must be a past position, not ahead of the cursor */
+	BUG_ON(nbytes > xdr->iov->iov_len || nbytes > xdr->buf->len);
+	xdr->p = q;	/* move the encode cursor back to the checkpoint */
+	xdr->iov->iov_len -= nbytes;	/* shrink the current iovec by the rewound bytes */
+	xdr->buf->len -= nbytes;	/* and the overall buffer length to match */
+	return q;
+}
+EXPORT_SYMBOL_GPL(xdr_rewind_stream);
+
+/**
* xdr_write_pages - Insert a list of pages into an XDR buffer for sending
* @xdr: pointer to xdr_stream
* @pages: list of pages