diff --git a/config-generic b/config-generic index 1a4649f08..d2cbbfbc0 100644 --- a/config-generic +++ b/config-generic @@ -3548,6 +3548,15 @@ CONFIG_NFSD_V3=y CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFS_FSCACHE=y +# Enable pNFS (NOTE(review): Documentation/filesystems/spnfs.txt says PNFSD_LOCAL_EXPORT must be disabled when SPNFS is enabled -- confirm this combination) +CONFIG_PNFS_OBJLAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_PANLAYOUT=m +CONFIG_PNFSD_LOCAL_EXPORT=y +CONFIG_PNFSD=y +CONFIG_SPNFS=y +CONFIG_SPNFS_LAYOUTSEGMENTS=y +CONFIG_SPNFS_BLOCK=y # CONFIG_NFS_USE_LEGACY_DNS is not set # CONFIG_NFS_USE_NEW_IDMAPPER is not set # CONFIG_NFSD_DEPRECATED is not set diff --git a/kernel.spec b/kernel.spec index d82cccbc0..ea1e36d00 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,8 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) # -# % define buildid .local +%define buildid .pnfs.2011.03.25 + ################################################################### # The buildid can also be specified on the rpmbuild command line @@ -739,6 +740,9 @@ Patch12430: dcdbas-force-smi-to-happen-when-expected.patch # CVE-2011-1182 Patch12431: prevent-rt_sigqueueinfo-and-rt_tgsigqueueinfo-from-spoofing-the-signal-code.patch +Patch30000: pnfs-all-2.6.38-2011-03-25.patch +Patch30001: linux-2.6-pnfs-compile.patch +Patch30002: linux-2.6.35-inline.patch %endif BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root @@ -1372,6 +1376,9 @@ ApplyPatch dcdbas-force-smi-to-happen-when-expected.patch # CVE-2011-1182 ApplyPatch prevent-rt_sigqueueinfo-and-rt_tgsigqueueinfo-from-spoofing-the-signal-code.patch +ApplyPatch pnfs-all-2.6.38-2011-03-25.patch +ApplyPatch linux-2.6-pnfs-compile.patch +ApplyPatch linux-2.6.35-inline.patch # END OF PATCH APPLICATIONS %endif @@ -1980,6 +1987,9 @@ fi # and build. 
%changelog +* Sat Mar 26 2011 Steve Dickson 2.6.38.1-6.pnfs +- Updated to the latest pNFS branch: pnfs-all-2.6.38-2011-03-25 + * Fri Mar 25 2011 Chuck Ebbert - CVE-2011-1182: kernel signal spoofing issue - Drop unused patches already applied upstream: diff --git a/linux-2.6-pnfs-compile.patch b/linux-2.6-pnfs-compile.patch new file mode 100644 index 000000000..3dc52b7c9 --- /dev/null +++ b/linux-2.6-pnfs-compile.patch @@ -0,0 +1,12 @@ +diff -up linux-2.6.38.noarch/include/net/inet_connection_sock.h.orig linux-2.6.38.noarch/include/net/inet_connection_sock.h +--- linux-2.6.38.noarch/include/net/inet_connection_sock.h.orig 2011-03-26 08:15:35.417892830 -0400 ++++ linux-2.6.38.noarch/include/net/inet_connection_sock.h 2011-03-26 08:15:45.301801362 -0400 +@@ -23,7 +23,7 @@ + #include + #include + +-#define INET_CSK_DEBUG 1 ++//#define INET_CSK_DEBUG 1 + + /* Cancel timers, when they are not required. */ + #undef INET_CSK_CLEAR_TIMERS diff --git a/linux-2.6.35-inline.patch b/linux-2.6.35-inline.patch new file mode 100644 index 000000000..c56d8da5e --- /dev/null +++ b/linux-2.6.35-inline.patch @@ -0,0 +1,11 @@ +diff -up linux-2.6.34.noarch/arch/x86/Makefile.orig linux-2.6.34.noarch/arch/x86/Makefile +--- linux-2.6.34.noarch/arch/x86/Makefile.orig 2010-07-01 13:33:21.859627499 -0400 ++++ linux-2.6.34.noarch/arch/x86/Makefile 2010-07-01 13:36:26.751576450 -0400 +@@ -81,6 +81,7 @@ ifdef CONFIG_CC_STACKPROTECTOR + $(warning stack protector enabled but no compiler support) + endif + endif ++KBUILD_CFLAGS += -fno-inline-functions-called-once + + # Don't unroll struct assignments with kmemcheck enabled + ifeq ($(CONFIG_KMEMCHECK),y) diff --git a/pnfs-all-2.6.38-2011-03-25.patch b/pnfs-all-2.6.38-2011-03-25.patch new file mode 100644 index 000000000..00c3db299 --- /dev/null +++ b/pnfs-all-2.6.38-2011-03-25.patch @@ -0,0 +1,27007 @@ +diff -up linux-2.6.38.noarch/Documentation/filesystems/Locking.orig linux-2.6.38.noarch/Documentation/filesystems/Locking +--- 
linux-2.6.38.noarch/Documentation/filesystems/Locking.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/Documentation/filesystems/Locking 2011-03-26 07:57:44.217821899 -0400 +@@ -21,6 +21,7 @@ prototypes: + char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); + struct vfsmount *(*d_automount)(struct path *path); + int (*d_manage)(struct dentry *, bool); ++ void (*d_unlink)(struct dentry *, struct dentry *); + + locking rules: + rename_lock ->d_lock may block rcu-walk +@@ -33,6 +34,7 @@ d_iput: no no yes no + d_dname: no no no no + d_automount: no no yes no + d_manage: no no yes (ref-walk) maybe ++d_unlink no no yes no + + --------------------------- inode_operations --------------------------- + prototypes: +diff -up linux-2.6.38.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.38.noarch/Documentation/filesystems/spnfs.txt +--- linux-2.6.38.noarch/Documentation/filesystems/spnfs.txt.orig 2011-03-26 07:57:44.217821899 -0400 ++++ linux-2.6.38.noarch/Documentation/filesystems/spnfs.txt 2011-03-26 07:57:44.218821875 -0400 +@@ -0,0 +1,211 @@ ++(c) 2007 Network Appliance Inc. ++ ++spNFS ++----- ++ ++An spNFS system consists of a Meta Data Server (MDS), a number of Client machines (C) and a number of Data Servers (DS). ++ ++A file system is mounted by the clients from the MDS, and all file data ++is striped across the DSs. ++ ++Identify the machines that will be filling each of these roles. ++ ++The spnfs kernel will be installed on all machines: clients, the MDS and DSs. 
++ ++ ++Building and installing the spNFS kernel ++---------------------------------------- ++ ++Get the spNFS kernel from: ++ ++ git://linux-nfs.org/~bhalevy/linux-pnfs.git ++ ++Use the pnfs-all-latest branch and add these options to your .config file ++ ++ CONFIG_NETWORK_FILESYSTEMS=y ++ CONFIG_NFS_FS=m ++ CONFIG_NFS_V4=y ++ CONFIG_NFS_V4_1=y ++ CONFIG_PNFS=y ++ CONFIG_NFSD=m ++ CONFIG_PNFSD=y ++ # CONFIG_PNFSD_LOCAL_EXPORT is not set ++ CONFIG_SPNFS=y ++ ++By default, spNFS uses whole-file layouts. Layout segments can be enabled ++by adding: ++ ++ CONFIG_SPNFS_LAYOUTSEGMENTS=y ++ ++to your .config file. ++ ++Building and installation of kernel+modules is as usual. ++This kernel should be installed and booted on the client, MDS and DSs. ++ ++Note that CONFIG_PNFSD_LOCAL_EXPORT must be disabled for spnfs as it ++takes over the pnfs export interface. ++ ++Building nfs-utils ++------------------ ++ ++Get the nfs-utils package containing spnfsd from: ++ ++ git://linux-nfs.org/~bhalevy/pnfs-nfs-utils.git ++ ++Follow the standard instructions for building nfs-utils. ++ ++After building, the spnfsd daemon will be located in utils/spnfsd. The spnfsd ++daemon will only be needed on the MDS. ++ ++ ++Installation ++------------ ++ ++The nfs-utils package contains a default spnfsd.conf file in ++utils/spnfsd/spnfsd.conf. Copy this file to /etc/spnfsd.conf. ++ ++By default, the DS-Mount-Directory is set to /spnfs (see spnfsd.conf). Under ++this directory, mount points must be created for each DS to ++be used for pNFS data stripes. These mount points are named by the ip address ++of the corresponding DS. In the sample spnfsd.conf, there are two ++DSs defined (172.16.28.134 and 172.16.28.141). ++ ++Following the sample spnfsd.conf, ++ ++ mkdir /spnfs ++ ++on the MDS (corresponding to DS-Mount-Directory). Then ++ ++ mkdir /spnfs/172.16.28.134 ++ mkdir /spnfs/172.16.28.141 ++ ++to create the mount points for the DSs. 
++ ++On the DSs, choose a directory where data stripes will be created by the MDS. ++For the sample file, this directory is /pnfs, so on each DS execute: ++ ++ mkdir /pnfs ++ ++This directory is specified in the spnfsd.conf file by the DS*_ROOT option ++(where * is replaced by the DS number). DS_ROOT is specified relative to ++the directory being exported by the DSs. In our example, our DSs are exporting ++the root directory (/) and therefore our DS_ROOT is /pnfs. On the DSs, we have ++the following entry in /etc/exports: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check) ++ ++N.B. If we had created a /exports directory and a /pnfs directory under ++/exports, and if we were exporting /exports, then DS_ROOT would still be /pnfs ++(not /exports/pnfs). ++ ++It may be useful to add entries to /etc/fstab on the MDS to automatically ++mount the DS_ROOT file systems. For this example, our MDS fstab would ++contain: ++ ++ 172.16.28.134:/pnfs /spnfs/172.16.28.134 nfs defaults 1 2 ++ 172.16.28.141:/pnfs /spnfs/172.16.28.141 nfs defaults 1 2 ++ ++The DS mounts must be performed manually or via fstab at this time (automatic ++mounting, directory creation, etc. are on the todo list). To perform I/O ++through the MDS, the DS mounts MUST use NFSv3 at this time (this restriction ++will eventually be removed). ++ ++ ++On the MDS, choose a file system to use with spNFS and export it, e.g.: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check,pnfs) ++ ++Make sure nfsd and all supporting processes are running on the MDS and DSs. ++ ++ ++Running ++------- ++ ++If rpc_pipefs is not already mounted (if you're running idmapd it probably is), ++you may want to add the following line to /etc/fstab: ++ ++ rpc_pipefs /var/lib/nfs/rpc_pipefs rpc_pipefs defaults 0 0 ++ ++to automatically mount rpc_pipefs. ++ ++With spnfsd.conf configured for your environment and the mounts mounted as ++described above, spnfsd can now be started. 
++ ++On the MDS, execute spnfsd: ++ ++ spnfsd ++ ++The executable is located in the directory where it was built, and ++may also have been installed elsewhere depending on how you built nfs-utils. ++It will run in the foreground by default, and in fact will do so despite ++any options suggesting the contrary (it's still a debugging build). ++ ++On the client, make sure the nfslayoutdriver module is loaded: ++ ++ modprobe nfslayoutdriver ++ ++Then mount the file system from the MDS: ++ ++ mount -t nfs4 -o minorversion=1 mds:/ /mnt ++ ++I/O through the MDS is now supported. To use it, do not load the ++nfslayoutdriver on the client, and mount the MDS using NFSv4 or 4.1 ++(NFSv2 and v3 are not yet supported). ++ ++You may now use spNFS by performing file system activities in /mnt. ++If you create files in /mnt, you should see stripe files corresponding to ++new files being created on the DSs. The current implementation names the ++stripe files based on the inode number of the file on the MDS. For example, ++if you create a file foo in /mnt and do an 'ls -li /mnt/foo': ++ ++ # ls -li foo ++ 1233 -rw-r--r-- 1 root root 0 Nov 29 15:54 foo ++ ++You should see stripe files on each DS under /pnfs (per the sample) named ++1233. The file /pnfs/1233 on DS1 will contain the first bytes ++of data written to foo, DS2 will contain the next bytes, etc. ++Removing /mnt/foo will remove the corresponding stripe files on the DSs. ++Other file system operations should behave (mostly :-) as expected. ++ ++ ++Layout Segments ++--------------- ++ ++If the kernel is compiled to support layout segments, there will ++be two files created under /proc/fs/spnfs for controlling layout ++segment functionality. 
++ ++To enable layout segments, write a '1' to /proc/fs/spnfs/layoutseg, e.g.: ++ ++ echo 1 > /proc/fs/spnfs/layoutseg ++ ++Layout segments can be disabled (returning to whole-file layouts) by ++writing a '0' to /proc/fs/spnfs/layoutseg: ++ ++ echo 0 > /proc/fs/spnfs/layoutseg ++ ++When layout segments are enabled, the size of the layouts returned can ++be specified by writing a decimal number (ascii representation) to ++/proc/fs/spnfs/layoutsegsize: ++ ++ echo 1024 > /proc/fs/spnfs/layoutsegsize ++ ++The value'0' has a special meaning--it causes the server to return a ++layout that is exactly the size requested by the client: ++ ++ echo 0 > /proc/fs/spnfs/layoutsegsize ++ ++ ++Troubleshooting ++--------------- ++ ++If you see data being written to the files on the MDS rather than ++the stripe files, make sure the nfslayoutdriver is loaded on the client ++(see above). ++ ++If you get a "permission denied" error, make sure mountd is running on the mds ++(it occasionally fails to start). ++ ++Bugs, enhancements, compliments, complaints to: dmuntz@netapp.com ++ ++ +diff -up linux-2.6.38.noarch/Documentation/filesystems/vfs.txt.orig linux-2.6.38.noarch/Documentation/filesystems/vfs.txt +--- linux-2.6.38.noarch/Documentation/filesystems/vfs.txt.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/Documentation/filesystems/vfs.txt 2011-03-26 07:57:44.218821875 -0400 +@@ -866,6 +866,7 @@ struct dentry_operations { + char *(*d_dname)(struct dentry *, char *, int); + struct vfsmount *(*d_automount)(struct path *); + int (*d_manage)(struct dentry *, bool, bool); ++ void (*d_unlink)(struct dentry *, struct dentry *); + }; + + d_revalidate: called when the VFS needs to revalidate a dentry. This +@@ -973,6 +974,14 @@ struct dentry_operations { + This function is only used if DCACHE_MANAGE_TRANSIT is set on the + dentry being transited from. + ++ d_unlink: called to allow the filesystem to unlink the dentry after final ++ use. 
It is only called when DCACHE_NFSFS_RENAMED is set, and is ++ designed for use by 'sillyrename' schemes that are commonly ++ implemented on distributed filesystems such as NFS. ++ ++ Note that the filesystem is still responsible for protecting against ++ races with other lookups. ++ + Example : + + static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) +diff -up linux-2.6.38.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.38.noarch/drivers/md/dm-ioctl.c +--- linux-2.6.38.noarch/drivers/md/dm-ioctl.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/drivers/md/dm-ioctl.c 2011-03-26 07:57:44.220821839 -0400 +@@ -713,6 +713,12 @@ static int dev_create(struct dm_ioctl *p + return 0; + } + ++int dm_dev_create(struct dm_ioctl *param) ++{ ++ return dev_create(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_create); ++ + /* + * Always use UUID for lookups if it's present, otherwise use name or dev. + */ +@@ -808,6 +814,12 @@ static int dev_remove(struct dm_ioctl *p + return 0; + } + ++int dm_dev_remove(struct dm_ioctl *param) ++{ ++ return dev_remove(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_remove); ++ + /* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. +@@ -990,6 +1002,12 @@ static int do_resume(struct dm_ioctl *pa + return r; + } + ++int dm_do_resume(struct dm_ioctl *param) ++{ ++ return do_resume(param); ++} ++EXPORT_SYMBOL(dm_do_resume); ++ + /* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. 
+@@ -1256,6 +1274,12 @@ out: + return r; + } + ++int dm_table_load(struct dm_ioctl *param, size_t param_size) ++{ ++ return table_load(param, param_size); ++} ++EXPORT_SYMBOL(dm_table_load); ++ + static int table_clear(struct dm_ioctl *param, size_t param_size) + { + struct hash_cell *hc; +diff -up linux-2.6.38.noarch/drivers/scsi/hosts.c.orig linux-2.6.38.noarch/drivers/scsi/hosts.c +--- linux-2.6.38.noarch/drivers/scsi/hosts.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/drivers/scsi/hosts.c 2011-03-26 07:57:44.221821816 -0400 +@@ -50,10 +50,11 @@ static void scsi_host_cls_release(struct + put_device(&class_to_shost(dev)->shost_gendev); + } + +-static struct class shost_class = { ++struct class shost_class = { + .name = "scsi_host", + .dev_release = scsi_host_cls_release, + }; ++EXPORT_SYMBOL(shost_class); + + /** + * scsi_host_set_state - Take the given host through the host state model. +diff -up linux-2.6.38.noarch/fs/dcache.c.orig linux-2.6.38.noarch/fs/dcache.c +--- linux-2.6.38.noarch/fs/dcache.c.orig 2011-03-26 07:53:05.384187507 -0400 ++++ linux-2.6.38.noarch/fs/dcache.c 2011-03-26 07:57:44.223821768 -0400 +@@ -305,6 +305,9 @@ static struct dentry *d_kill(struct dent + if (parent) + spin_unlock(&parent->d_lock); + dentry_iput(dentry); ++ ++ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) ++ dentry->d_op->d_unlink(parent, dentry); + /* + * dentry_iput drops the locks, at which point nobody (except + * transient RCU lookups) can reach this dentry. 
+@@ -2075,6 +2078,8 @@ again: + dentry->d_flags &= ~DCACHE_CANT_MOUNT; + dentry_unlink_inode(dentry); + fsnotify_nameremove(dentry, isdir); ++ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) ++ dentry->d_op->d_unlink(dentry->d_parent, dentry); + return; + } + +diff -up linux-2.6.38.noarch/fs/exofs/exofs.h.orig linux-2.6.38.noarch/fs/exofs/exofs.h +--- linux-2.6.38.noarch/fs/exofs/exofs.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/exofs/exofs.h 2011-03-26 07:57:44.225821735 -0400 +@@ -36,13 +36,9 @@ + #include + #include + #include ++#include + #include "common.h" + +-/* FIXME: Remove once pnfs hits mainline +- * #include +- */ +-#include "pnfs.h" +- + #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) + + #ifdef CONFIG_EXOFS_DEBUG +@@ -103,6 +99,7 @@ struct exofs_sb_info { + struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ ++ spinlock_t i_layout_lock; /* lock for layout/return/recall */ + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ +@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si + */ + #define OBJ_2BCREATED 0 /* object will be created soon*/ + #define OBJ_CREATED 1 /* object has been created on the osd*/ ++/* Below are not used atomic but reuse the same i_flags */ ++#define OBJ_LAYOUT_IS_GIVEN 2 /* inode has given layouts to clients*/ ++#define OBJ_IN_LAYOUT_RECALL 3 /* inode is in the middle of a layout recall*/ + + static inline int obj_2bcreated(struct exofs_i_info *oi) + { +@@ -303,4 +303,21 @@ extern const struct inode_operations exo + extern const struct inode_operations exofs_symlink_inode_operations; + extern const struct inode_operations exofs_fast_symlink_inode_operations; + ++/* export.c */ ++typedef int (exofs_recall_fn)(struct inode *inode, u64 data); ++#ifdef CONFIG_PNFSD ++int 
exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo, u64 todo_data); ++void exofs_init_export(struct super_block *sb); ++#else ++static inline int ++exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++exofs_recall_fn todo, u64 todo_data) ++{ ++ return todo(inode, todo_data); ++} ++ ++static inline void exofs_init_export(struct super_block *sb) {} ++#endif ++ + #endif +diff -up linux-2.6.38.noarch/fs/exofs/export.c.orig linux-2.6.38.noarch/fs/exofs/export.c +--- linux-2.6.38.noarch/fs/exofs/export.c.orig 2011-03-26 07:57:44.226821719 -0400 ++++ linux-2.6.38.noarch/fs/exofs/export.c 2011-03-26 07:57:44.226821719 -0400 +@@ -0,0 +1,396 @@ ++/* ++ * export.c - Implementation of the pnfs_export_operations ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Boaz Harrosh ++ * ++ * This file is part of exofs. ++ * ++ * exofs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation. Since it is based on ext2, and the only ++ * valid version of GPL for the Linux kernel is version 2, the only valid ++ * version of GPL for exofs is version 2. ++ * ++ * exofs is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with exofs; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++#include "exofs.h" ++ ++static int exofs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_OSD2_OBJECTS; ++} ++ ++static void set_dev_id(struct nfs4_deviceid *pnfs_devid, u64 sbid, u64 devid) ++{ ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)pnfs_devid; ++ ++ dev_id->sbid = sbid; ++ dev_id->devid = devid; ++} ++ ++static int cb_layout_recall(struct inode *inode, enum pnfs_iomode iomode, ++ u64 offset, u64 length, void *cookie) ++{ ++ struct nfsd4_pnfs_cb_layout cbl; ++ struct pnfsd_cb_ctl cb_ctl; ++ int status; ++ ++ memset(&cb_ctl, 0, sizeof(cb_ctl)); ++ status = pnfsd_get_cb_op(&cb_ctl); ++ if (unlikely(status)) { ++ EXOFS_ERR("%s: nfsd unloaded!! inode (0x%lx) status=%d\n", ++ __func__, inode->i_ino, status); ++ goto err; ++ } ++ ++ memset(&cbl, 0, sizeof(cbl)); ++ cbl.cbl_recall_type = RETURN_FILE; ++ cbl.cbl_seg.layout_type = LAYOUT_OSD2_OBJECTS; ++ cbl.cbl_seg.iomode = iomode; ++ cbl.cbl_seg.offset = offset; ++ cbl.cbl_seg.length = length; ++ cbl.cbl_cookie = cookie; ++ ++ status = cb_ctl.cb_op->cb_layout_recall(inode->i_sb, inode, &cbl); ++ pnfsd_put_cb_op(&cb_ctl); ++ ++err: ++ return status; ++} ++ ++static enum nfsstat4 exofs_layout_get( ++ struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; ++ struct exofs_layout *el = &sbi->layout; ++ struct pnfs_osd_object_cred *creds = NULL; ++ struct pnfs_osd_layout layout; ++ __be32 *start; ++ bool in_recall; ++ int i, err; ++ enum nfsstat4 nfserr; ++ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ res->lg_seg.iomode = IOMODE_RW; ++ 
res->lg_return_on_close = true; /* TODO: unused but will be soon */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ creds = kcalloc(el->s_numdevs, sizeof(*creds), GFP_KERNEL); ++ if (!creds) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto out; ++ } ++ ++ /* Fill in a pnfs_osd_layout struct */ ++ layout.olo_map = sbi->data_map; ++ ++ for (i = 0; i < el->s_numdevs; i++) { ++ struct pnfs_osd_object_cred *cred = &creds[i]; ++ osd_id id = exofs_oi_objno(oi); ++ unsigned dev = exofs_layout_od_id(el, id, i); ++ ++ set_dev_id(&cred->oc_object_id.oid_device_id, args->lg_sbid, ++ dev); ++ cred->oc_object_id.oid_partition_id = el->s_pid; ++ cred->oc_object_id.oid_object_id = id; ++ cred->oc_osd_version = osd_dev_is_ver1(el->s_ods[dev]) ? ++ PNFS_OSD_VERSION_1 : ++ PNFS_OSD_VERSION_2; ++ cred->oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE; ++ ++ cred->oc_cap_key.cred_len = 0; ++ cred->oc_cap_key.cred = NULL; ++ ++ cred->oc_cap.cred_len = OSD_CAP_LEN; ++ cred->oc_cap.cred = oi->i_cred; ++ } ++ ++ layout.olo_comps_index = 0; ++ layout.olo_num_comps = el->s_numdevs; ++ layout.olo_comps = creds; ++ ++ err = pnfs_osd_xdr_encode_layout(xdr, &layout); ++ if (err) { ++ nfserr = NFS4ERR_TOOSMALL; /* FIXME: Change osd_xdr error codes */ ++ goto out; ++ } ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ if (!in_recall) { ++ __set_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ nfserr = NFS4_OK; ++ } else { ++ nfserr = NFS4ERR_RECALLCONFLICT; ++ } ++ spin_unlock(&oi->i_layout_lock); ++ ++out: ++ kfree(creds); ++ EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n", ++ inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start)); ++ return nfserr; ++} ++ ++/* NOTE: inode mutex must NOT be held */ ++static int exofs_layout_commit( ++ struct inode *inode, ++ const struct 
nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct timespec mtime; ++ loff_t i_size; ++ int in_recall; ++ ++ /* In case of a recall we ignore the new size and mtime since they ++ * are going to be changed again by truncate, and since we cannot take ++ * the inode lock in that case. ++ */ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ if (in_recall) { ++ EXOFS_DBGMSG("(0x%lx) commit was called during recall\n", ++ inode->i_ino); ++ return 0; ++ } ++ ++ /* NOTE: I would love to call inode_setattr here ++ * but i cannot since this will cause an eventual vmtruncate, ++ * which will cause a layout_recall. So open code the i_size ++ * and mtime/atime changes under i_mutex. ++ */ ++ mutex_lock_nested(&inode->i_mutex, I_MUTEX_NORMAL); ++ ++ if (args->lc_mtime.seconds) { ++ mtime.tv_sec = args->lc_mtime.seconds; ++ mtime.tv_nsec = args->lc_mtime.nseconds; ++ ++ /* layout commit may only make time bigger, since there might ++ * be reordering of the notifications and it might arrive after ++ * A local change. ++ * TODO: if mtime > ctime then we know set_attr did an mtime ++ * in the future. and we can let this update through ++ */ ++ if (0 >= timespec_compare(&mtime, &inode->i_mtime)) ++ mtime = inode->i_mtime; /* client time is not newer: keep ours */ ++ } else { ++ mtime = current_fs_time(inode->i_sb); ++ } ++ ++ /* TODO: Will below work? 
since mark_inode_dirty has it's own ++ * Time handling ++ */ ++ inode->i_atime = inode->i_mtime = mtime; ++ ++ i_size = i_size_read(inode); ++ if (args->lc_newoffset) { ++ loff_t new_size = args->lc_last_wr + 1; ++ ++ if (i_size < new_size) { ++ i_size_write(inode, i_size = new_size); ++ res->lc_size_chg = 1; ++ res->lc_newsize = new_size; ++ } ++ } ++ /* TODO: else { i_size = osd_get_object_length() } */ ++ ++/* TODO: exofs does not currently use the osd_xdr part of the layout_commit */ ++ ++ mark_inode_dirty_sync(inode); ++ ++ mutex_unlock(&inode->i_mutex); ++ EXOFS_DBGMSG("(0x%lx) i_size=0x%llx lcp->off=0x%llx\n", ++ inode->i_ino, i_size, args->lc_last_wr); ++ return 0; ++} ++ ++static void exofs_handle_error(struct pnfs_osd_ioerr *ioerr) ++{ ++ EXOFS_ERR("exofs_handle_error: errno=%d is_write=%d obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ ioerr->oer_errno, ioerr->oer_iswrite, ++ _LLU(ioerr->oer_component.oid_object_id), ++ _LLU(ioerr->oer_comp_offset), ++ _LLU(ioerr->oer_comp_length)); ++} ++ ++static int exofs_layout_return( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ __be32 *p = args->lrf_body; ++ unsigned len = exp_xdr_qwords(args->lrf_body_len); ++ ++ EXOFS_DBGMSG("(0x%lx) cookie %p xdr_len %d\n", ++ inode->i_ino, args->lr_cookie, len); ++ ++ while (len >= pnfs_osd_ioerr_xdr_sz()) { ++ struct pnfs_osd_ioerr ioerr; ++ ++ p = pnfs_osd_xdr_decode_ioerr(&ioerr, p); ++ len -= pnfs_osd_ioerr_xdr_sz(); ++ exofs_handle_error(&ioerr); ++ } ++ ++ if (args->lr_cookie) { ++ struct exofs_i_info *oi = exofs_i(inode); ++ bool in_recall; ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ __clear_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ /* TODO: how to communicate cookie with the waiter */ ++ if (in_recall) ++ wake_up(&oi->i_wq); /* wakeup any recalls */ ++ } ++ ++ return 0; ++} ++ ++int exofs_get_device_info(struct super_block *sb, struct 
exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct exofs_sb_info *sbi = sb->s_fs_info; ++ struct pnfs_osd_deviceaddr devaddr; ++ const struct osd_dev_info *odi; ++ u64 devno = devid->devid; ++ __be32 *start; ++ int err; ++ ++ memset(&devaddr, 0, sizeof(devaddr)); ++ ++ if (unlikely(devno >= sbi->layout.s_numdevs)) ++ return -ENODEV; ++ ++ odi = osduld_device_info(sbi->layout.s_ods[devno]); ++ ++ devaddr.oda_systemid.len = odi->systemid_len; ++ devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */ ++ ++ devaddr.oda_osdname.len = odi->osdname_len ; ++ devaddr.oda_osdname.data = (void *)odi->osdname;/* !const cast */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ err = -E2BIG; ++ goto err; ++ } ++ ++ err = pnfs_osd_xdr_encode_deviceaddr(xdr, &devaddr); ++ if (err) ++ goto err; ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ EXOFS_DBGMSG("xdr_bytes=%Zu devno=%lld osdname-%s\n", ++ exp_xdr_qbytes(xdr->p - start), devno, odi->osdname); ++ return 0; ++ ++err: ++ EXOFS_DBGMSG("Error: err=%d at_byte=%zu\n", ++ err, exp_xdr_qbytes(xdr->p - start)); ++ return err; ++} ++ ++struct pnfs_export_operations exofs_pnfs_ops = { ++ .layout_type = exofs_layout_type, ++ .layout_get = exofs_layout_get, ++ .layout_commit = exofs_layout_commit, ++ .layout_return = exofs_layout_return, ++ .get_device_info = exofs_get_device_info, ++}; ++ ++static bool is_layout_returned(struct exofs_i_info *oi) ++{ ++ bool layout_given; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ return !layout_given; ++} ++ ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo, u64 todo_data) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ int layout_given; ++ int error = 0; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = 
test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ __set_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ if (!layout_given) ++ goto exec; ++ ++ for (;;) { ++ EXOFS_DBGMSG("(0x%lx) has_layout issue a recall\n", ++ inode->i_ino); ++ error = cb_layout_recall(inode, iomode, 0, NFS4_MAX_UINT64, ++ &oi->i_wq); ++ switch (error) { ++ case 0: ++ case -EAGAIN: ++ break; ++ case -ENOENT: ++ goto exec; ++ default: ++ goto err; ++ } ++ ++ error = wait_event_interruptible(oi->i_wq, ++ is_layout_returned(oi)); ++ if (error) ++ goto err; ++ } ++ ++exec: ++ error = todo(inode, todo_data); ++ ++err: ++ spin_lock(&oi->i_layout_lock); ++ __clear_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ EXOFS_DBGMSG("(0x%lx) return=>%d\n", inode->i_ino, error); ++ return error; ++} ++ ++void exofs_init_export(struct super_block *sb) ++{ ++ sb->s_pnfs_op = &exofs_pnfs_ops; ++} +diff -up linux-2.6.38.noarch/fs/exofs/inode.c.orig linux-2.6.38.noarch/fs/exofs/inode.c +--- linux-2.6.38.noarch/fs/exofs/inode.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/exofs/inode.c 2011-03-26 07:57:44.227821703 -0400 +@@ -820,8 +820,9 @@ static inline int exofs_inode_is_fast_sy + const struct osd_attr g_attr_logical_length = ATTR_DEF( + OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); + +-static int _do_truncate(struct inode *inode, loff_t newsize) ++static int _do_truncate(struct inode *inode, u64 data) + { ++ loff_t newsize = data; + struct exofs_i_info *oi = exofs_i(inode); + int ret; + +@@ -858,7 +859,8 @@ int exofs_setattr(struct dentry *dentry, + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { +- error = _do_truncate(inode, iattr->ia_size); ++ error = exofs_inode_recall_layout(inode, IOMODE_ANY, ++ _do_truncate, iattr->ia_size); + if (unlikely(error)) + return error; + } +@@ -971,6 +973,7 @@ static void __oi_init(struct exofs_i_inf + { + init_waitqueue_head(&oi->i_wq); + 
oi->i_flags = 0; ++ spin_lock_init(&oi->i_layout_lock); + } + /* + * Fill in an inode read from the OSD and set it up for use +diff -up linux-2.6.38.noarch/fs/exofs/Kbuild.orig linux-2.6.38.noarch/fs/exofs/Kbuild +--- linux-2.6.38.noarch/fs/exofs/Kbuild.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/exofs/Kbuild 2011-03-26 07:57:44.224821753 -0400 +@@ -13,4 +13,5 @@ + # + + exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o ++exofs-$(CONFIG_PNFSD) += export.o + obj-$(CONFIG_EXOFS_FS) += exofs.o +diff -up linux-2.6.38.noarch/fs/exofs/Kconfig.orig linux-2.6.38.noarch/fs/exofs/Kconfig +--- linux-2.6.38.noarch/fs/exofs/Kconfig.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/exofs/Kconfig 2011-03-26 07:57:44.224821753 -0400 +@@ -1,6 +1,7 @@ + config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD ++ select EXPORTFS_OSD_LAYOUT if PNFSD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. 
+diff -up linux-2.6.38.noarch/fs/exofs/super.c.orig linux-2.6.38.noarch/fs/exofs/super.c +--- linux-2.6.38.noarch/fs/exofs/super.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/exofs/super.c 2011-03-26 07:57:44.229821686 -0400 +@@ -627,6 +627,7 @@ static int exofs_fill_super(struct super + sb->s_fs_info = sbi; + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; ++ exofs_init_export(sb); + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); +diff -up linux-2.6.38.noarch/fs/exportfs/expfs.c.orig linux-2.6.38.noarch/fs/exportfs/expfs.c +--- linux-2.6.38.noarch/fs/exportfs/expfs.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/exportfs/expfs.c 2011-03-26 07:57:44.230821684 -0400 +@@ -16,6 +16,13 @@ + #include + #include + ++#if defined(CONFIG_PNFSD) ++struct pnfsd_cb_ctl pnfsd_cb_ctl = { ++ .lock = __SPIN_LOCK_UNLOCKED(pnfsd_cb_ctl.lock) ++}; ++EXPORT_SYMBOL(pnfsd_cb_ctl); ++#endif /* CONFIG_PNFSD */ ++ + #define dprintk(fmt, args...) 
do{}while(0) + + +diff -up linux-2.6.38.noarch/fs/exportfs/Makefile.orig linux-2.6.38.noarch/fs/exportfs/Makefile +--- linux-2.6.38.noarch/fs/exportfs/Makefile.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/exportfs/Makefile 2011-03-26 07:57:44.229821686 -0400 +@@ -3,4 +3,7 @@ + + obj-$(CONFIG_EXPORTFS) += exportfs.o + +-exportfs-objs := expfs.o ++exportfs-y := expfs.o ++exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o ++exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o ++exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o +diff -up linux-2.6.38.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.38.noarch/fs/exportfs/nfs4blocklayoutxdr.c +--- linux-2.6.38.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2011-03-26 07:57:44.230821684 -0400 ++++ linux-2.6.38.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2011-03-26 07:57:44.230821684 -0400 +@@ -0,0 +1,158 @@ ++/* ++ * linux/fs/nfsd/nfs4blocklayoutxdr.c ++ * ++ * ++ * Created by Rick McNeal on 3/31/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. 
++ * ++ */ ++#include ++#include ++#include ++#include ++ ++/* ++ * Encode a SIMPLE volume: the constant 1 (presumably the signature ++ * component count -- TODO confirm against the block layout XDR), the ++ * signature offset and the opaque signature bytes. ++ * Returns 0, or -ETOOSMALL if the stream has no room. ++ */ ++static int ++bl_encode_simple(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 12 + 4 + bld->u.simple.bld_sig_len); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u32(p, 1); ++ p = exp_xdr_encode_u64(p, bld->u.simple.bld_offset); ++ exp_xdr_encode_opaque(p, bld->u.simple.bld_sig, ++ bld->u.simple.bld_sig_len); ++ ++ return 0; ++} ++ ++/* ++ * Encode a SLICE volume: bld_start, bld_len (64 bit each) and bld_index. ++ * Returns 0, or -ETOOSMALL if the stream has no room. ++ */ ++static int ++bl_encode_slice(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 2 + 1); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_start); ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_len); ++ exp_xdr_encode_u32(p, bld->u.slice.bld_index); ++ ++ return 0; ++} ++ ++/* CONCAT volumes are not implemented: always fails with -ENOTSUPP. */ ++static int ++bl_encode_concat(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ return -ENOTSUPP; ++} ++ ++/* ++ * Encode a STRIPE volume: chunk size (64 bit), number of stripes and ++ * one 32-bit index per stripe. ++ * Returns 0, or -ETOOSMALL if the stream has no room. ++ */ ++static int ++bl_encode_stripe(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ int i; ++ /* ++ * The size below is counted in 32-bit words (2 for the chunk size, ++ * 1 for the stripe count, 1 per stripe index), so it has to go ++ * through exp_xdr_reserve_qwords(); exp_xdr_reserve_space() takes ++ * bytes and would reserve only a quarter of what is written. ++ */ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, ++ 2 + 1 + bld->u.stripe.bld_stripes); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u64(p, bld->u.stripe.bld_chunk_size); ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripes); ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripe_indexs[i]); ++ ++ return 0; ++} ++ ++int ++blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes) ++{ ++ u32 num_vols = 0, ++ *layoutlen_p = xdr->p; ++ pnfs_blocklayout_devinfo_t *bld; ++ int status = 0; ++ __be32 *p; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -ETOOSMALL; ++ p += 2; ++ ++ /* ++ * All simple volumes with their signature are required to be listed ++ * first.
++ */ ++ list_for_each_entry(bld, volumes, bld_list) { ++ num_vols++; ++ p = exp_xdr_reserve_qwords(xdr, 1); ++ if (!p) ++ return -ETOOSMALL; ++ p = exp_xdr_encode_u32(p, bld->bld_type); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ status = bl_encode_simple(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_SLICE: ++ status = bl_encode_slice(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ status = bl_encode_concat(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ status = bl_encode_stripe(xdr, bld); ++ break; ++ default: ++ BUG(); ++ } ++ if (status) ++ goto error; ++ } ++ ++ /* ---- Fill in the overall length and number of volumes ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (xdr->p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, num_vols); ++ ++error: ++ return status; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_devinfo); ++ ++enum nfsstat4 ++blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *bl_head) ++{ ++ struct pnfs_blocklayout_layout *b; ++ u32 *layoutlen_p = xdr->p, ++ extents = 0; ++ __be32 *p; ++ ++ /* ++ * Save spot for opaque block layout length and number of extents, ++ * fill-in later. 
++ */ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p += 2; ++ ++ list_for_each_entry(b, bl_head, bll_list) { ++ extents++; ++ p = exp_xdr_reserve_qwords(xdr, 5 * 2 + 1); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.sbid); ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.devid); ++ p = exp_xdr_encode_u64(p, b->bll_foff); ++ p = exp_xdr_encode_u64(p, b->bll_len); ++ p = exp_xdr_encode_u64(p, b->bll_soff); ++ p = exp_xdr_encode_u32(p, b->bll_es); ++ } ++ ++ /* ---- Fill in the overall length and number of extents ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, extents); ++ ++ return NFS4_OK; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_layout); +diff -up linux-2.6.38.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.38.noarch/fs/exportfs/nfs4filelayoutxdr.c +--- linux-2.6.38.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2011-03-26 07:57:44.231821680 -0400 ++++ linux-2.6.38.noarch/fs/exportfs/nfs4filelayoutxdr.c 2011-03-26 07:57:44.231821680 -0400 +@@ -0,0 +1,218 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++/* We do our-own dprintk so filesystems are not dependent on sunrpc */ ++#ifdef dprintk ++#undef dprintk ++#endif ++#define dprintk(fmt, args, ...) do { } while (0) ++ ++/* Calculate the XDR length of the GETDEVICEINFO4resok structure ++ * excluding the gdir_notification and the gdir_device_addr da_layout_type. ++ * The result is counted in 32-bit XDR words; filelayout_encode_devinfo() ++ * reserves exactly this many words before it starts encoding, so the ++ * count must cover every address that function writes. ++ */ ++static int fl_devinfo_xdr_words(const struct pnfs_filelayout_device *fdev) ++{ ++ struct pnfs_filelayout_devaddr *fl_addr; ++ struct pnfs_filelayout_multipath *mp; ++ int i, j, nwords; ++ ++ /* da_addr_body length, indice length, indices, ++ * multipath_list4 length */ ++ nwords = 1 + 1 + fdev->fl_stripeindices_length + 1; ++ for (i = 0; i < fdev->fl_device_length; i++) { ++ mp = &fdev->fl_device_list[i]; ++ nwords++; /* multipath list length */ ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ /* size entry j itself: netid/addr lengths may ++ * differ between the paths of one device */ ++ fl_addr = &mp->fl_multipath_list[j]; ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_netid.len); ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_addr.len); ++ } ++ } ++ dprintk("<-- %s nwords %d\n", __func__, nwords); ++ return nwords; ++} ++ ++/* Encodes the nfsv4_1_file_layout_ds_addr4 structure from draft 13 ++ * on the response stream.
++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. ++ */ ++int ++filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ unsigned int i, j, len = 0, opaque_words; ++ u32 *p_in; ++ u32 index_count = fdev->fl_stripeindices_length; ++ u32 dev_count = fdev->fl_device_length; ++ int error = 0; ++ __be32 *p; ++ ++ opaque_words = fl_devinfo_xdr_words(fdev); ++ dprintk("%s: Begin indx_cnt: %u dev_cnt: %u total size %u\n", ++ __func__, ++ index_count, ++ dev_count, ++ opaque_words*4); ++ ++ /* check space for opaque length */ ++ p = p_in = exp_xdr_reserve_qwords(xdr, opaque_words); ++ if (!p) { ++ error = -ETOOSMALL; ++ goto out; ++ } ++ ++ /* Fill in length later */ ++ p++; ++ ++ /* encode device list indices */ ++ p = exp_xdr_encode_u32(p, index_count); ++ for (i = 0; i < index_count; i++) ++ p = exp_xdr_encode_u32(p, fdev->fl_stripeindices_list[i]); ++ ++ /* encode device list */ ++ p = exp_xdr_encode_u32(p, dev_count); ++ for (i = 0; i < dev_count; i++) { ++ struct pnfs_filelayout_multipath *mp = &fdev->fl_device_list[i]; ++ ++ p = exp_xdr_encode_u32(p, mp->fl_multipath_length); ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ struct pnfs_filelayout_devaddr *da = ++ &mp->fl_multipath_list[j]; ++ ++ /* Encode device info */ ++ p = exp_xdr_encode_opaque(p, da->r_netid.data, ++ da->r_netid.len); ++ p = exp_xdr_encode_opaque(p, da->r_addr.data, ++ da->r_addr.len); ++ } ++ } ++ ++ /* backfill in length. Subtract 4 for da_addr_body size */ ++ len = (char *)p - (char *)p_in; ++ exp_xdr_encode_u32(p_in, len - 4); ++ ++ error = 0; ++out: ++ dprintk("%s: End err %d xdrlen %d\n", ++ __func__, error, len); ++ return error; ++} ++EXPORT_SYMBOL(filelayout_encode_devinfo); ++ ++/* Encodes the loc_body structure from draft 13 ++ * on the response stream. ++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. 
++ */ ++enum nfsstat4 ++filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp) ++{ ++ u32 len = 0, nfl_util, fhlen, i; ++ u32 *layoutlen_p; ++ enum nfsstat4 nfserr; ++ __be32 *p; ++ ++ dprintk("%s: device_id %llx:%llx fsi %u, numfh %u\n", ++ __func__, ++ flp->device_id.pnfs_fsid, ++ flp->device_id.pnfs_devid, ++ flp->lg_first_stripe_index, ++ flp->lg_fh_length); ++ ++ /* Ensure file system added at least one file handle */ ++ if (flp->lg_fh_length <= 0) { ++ dprintk("%s: File Layout has no file handles!!\n", __func__); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto out; ++ } ++ ++ /* Ensure room for len, devid, util, first_stripe_index, ++ * pattern_offset, number of filehandles */ ++ p = layoutlen_p = exp_xdr_reserve_qwords(xdr, 1+2+2+1+1+2+1); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ /* save spot for opaque file layout length, fill-in later*/ ++ p++; ++ ++ /* encode device id */ ++ p = exp_xdr_encode_u64(p, flp->device_id.sbid); ++ p = exp_xdr_encode_u64(p, flp->device_id.devid); ++ ++ /* set and encode flags */ ++ nfl_util = flp->lg_stripe_unit; ++ if (flp->lg_commit_through_mds) ++ nfl_util |= NFL4_UFLG_COMMIT_THRU_MDS; ++ if (flp->lg_stripe_type == STRIPE_DENSE) ++ nfl_util |= NFL4_UFLG_DENSE; ++ p = exp_xdr_encode_u32(p, nfl_util); ++ ++ /* encode first stripe index */ ++ p = exp_xdr_encode_u32(p, flp->lg_first_stripe_index); ++ ++ /* encode striping pattern start */ ++ p = exp_xdr_encode_u64(p, flp->lg_pattern_offset); ++ ++ /* encode number of file handles */ ++ p = exp_xdr_encode_u32(p, flp->lg_fh_length); ++ ++ /* encode file handles */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ fhlen = flp->lg_fh_list[i].fh_size; ++ p = exp_xdr_reserve_space(xdr, 4 + fhlen); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ p = exp_xdr_encode_opaque(p, &flp->lg_fh_list[i].fh_base, fhlen); ++ } ++ ++ /* Set number of bytes encoded = total_bytes_encoded - length var */ ++ len = (char 
*)p - (char *)layoutlen_p; ++ exp_xdr_encode_u32(layoutlen_p, len - 4); ++ ++ nfserr = NFS4_OK; ++out: ++ dprintk("%s: End err %u xdrlen %d\n", ++ __func__, nfserr, len); ++ return nfserr; ++} ++EXPORT_SYMBOL(filelayout_encode_layout); +diff -up linux-2.6.38.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.38.noarch/fs/exportfs/pnfs_osd_xdr_srv.c +--- linux-2.6.38.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2011-03-26 07:57:44.232821674 -0400 ++++ linux-2.6.38.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2011-03-26 07:57:44.232821674 -0400 +@@ -0,0 +1,289 @@ ++/* ++ * pnfs_osd_xdr_enc.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_data_map( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_data_map *data_map) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 1+2+1+1+1+1); ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, data_map->odm_num_comps); ++ p = exp_xdr_encode_u64(p, data_map->odm_stripe_unit); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_width); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_depth); ++ p = exp_xdr_encode_u32(p, data_map->odm_mirror_cnt); ++ p = exp_xdr_encode_u32(p, data_map->odm_raid_algorithm); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline int pnfs_osd_xdr_encode_objid( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2+2+2+2); ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)&object_id->oid_device_id; ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u64(p, dev_id->sbid); ++ p = exp_xdr_encode_u64(p, dev_id->devid); ++ p = exp_xdr_encode_u64(p, object_id->oid_partition_id); ++ p = exp_xdr_encode_u64(p, 
object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * enum pnfs_osd_cap_key_sec4 { ++ * PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ * PNFS_OSD_CAP_KEY_SEC_SSV = 1 ++ * }; ++ * ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_object_cred( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_object_cred *olo_comp) ++{ ++ __be32 *p; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_objid(xdr, &olo_comp->oc_object_id); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_space(xdr, 3*4 + 4+olo_comp->oc_cap.cred_len); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, olo_comp->oc_osd_version); ++ ++ /* No sec for now */ ++ p = exp_xdr_encode_u32(p, PNFS_OSD_CAP_KEY_SEC_NONE); ++ p = exp_xdr_encode_u32(p, 0); /* opaque oc_capability_key<> */ ++ ++ exp_xdr_encode_opaque(p, olo_comp->oc_cap.cred, ++ olo_comp->oc_cap.cred_len); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_layout { ++ * struct pnfs_osd_data_map olo_map; ++ * u32 olo_comps_index; ++ * u32 olo_num_comps; ++ * struct pnfs_osd_object_cred *olo_comps; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *pol) ++{ ++ __be32 *p; ++ u32 i; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_data_map(xdr, &pol->olo_map); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, pol->olo_comps_index); ++ p = exp_xdr_encode_u32(p, pol->olo_num_comps); ++ ++ for (i = 0; i < pol->olo_num_comps; i++) { ++ err = pnfs_osd_xdr_encode_object_cred(xdr, &pol->olo_comps[i]); ++ if (err) ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_layout); ++ ++static int _encode_string(struct exp_xdr_stream *xdr, ++ const struct nfs4_string *str) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, 4 
+ str->len); ++ ++ if (!p) ++ return -E2BIG; ++ exp_xdr_encode_opaque(p, str->data, str->len); ++ return 0; ++} ++ ++/* struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr) ++{ ++ __be32 *p; ++ int err; ++ ++ p = exp_xdr_reserve_space(xdr, 4 + 4 + sizeof(devaddr->oda_lun)); ++ if (!p) ++ return -E2BIG; ++ ++ /* Empty oda_targetid */ ++ p = exp_xdr_encode_u32(p, OBJ_TARGET_ANON); ++ ++ /* Empty oda_targetaddr for now */ ++ p = exp_xdr_encode_u32(p, 0); ++ ++ /* oda_lun */ ++ exp_xdr_encode_bytes(p, devaddr->oda_lun, sizeof(devaddr->oda_lun)); ++ ++ err = _encode_string(xdr, &devaddr->oda_systemid); ++ if (err) ++ return err; ++ ++ err = pnfs_osd_xdr_encode_object_cred(xdr, ++ &devaddr->oda_root_obj_cred); ++ if (err) ++ return err; ++ ++ err = _encode_string(xdr, &devaddr->oda_osdname); ++ if (err) ++ return err; ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_deviceaddr); ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p) ++{ ++ lou->dsu_valid = be32_to_cpu(*p++); ++ if (lou->dsu_valid) ++ p = xdr_decode_hyper(p, &lou->dsu_delta); ++ lou->olu_ioerr_flag = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_layoutupdate); ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline __be32 * ++pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) ++{ ++ /* FIXME: p = xdr_decode_fixed(...) 
*/ ++ memcpy(objid->oid_device_id.data, p, sizeof(objid->oid_device_id.data)); ++ p += XDR_QUADLEN(sizeof(objid->oid_device_id.data)); ++ ++ p = xdr_decode_hyper(p, &objid->oid_partition_id); ++ p = xdr_decode_hyper(p, &objid->oid_object_id); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p) ++{ ++ p = pnfs_osd_xdr_decode_objid(p, &ioerr->oer_component); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_offset); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_length); ++ ioerr->oer_iswrite = be32_to_cpu(*p++); ++ ioerr->oer_errno = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); +diff -up linux-2.6.38.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.38.noarch/fs/gfs2/ops_fstype.c +--- linux-2.6.38.noarch/fs/gfs2/ops_fstype.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/gfs2/ops_fstype.c 2011-03-26 07:57:44.233821664 -0400 +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + #include "gfs2.h" + #include "incore.h" +@@ -1107,6 +1108,9 @@ static int fill_super(struct super_block + sb->s_op = &gfs2_super_ops; + sb->s_d_op = &gfs2_dops; + sb->s_export_op = &gfs2_export_ops; ++#if defined(CONFIG_PNFSD) ++ sb->s_pnfs_op = &pnfs_dlm_export_ops; ++#endif /* CONFIG_PNFSD */ + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; +diff -up linux-2.6.38.noarch/fs/Kconfig.orig linux-2.6.38.noarch/fs/Kconfig +--- linux-2.6.38.noarch/fs/Kconfig.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/Kconfig 2011-03-26 07:57:44.221821816 -0400 +@@ -49,6 +49,28 @@ config FS_POSIX_ACL + config EXPORTFS + tristate + ++config EXPORTFS_FILE_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the 
NFSv4.1 files layout type. ++ Must be automatically selected by supporting filesystems. ++ ++config EXPORTFS_OSD_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 objects layout type. ++ Must be automatically selected by supporting osd ++ filesystems. ++ ++config EXPORTFS_BLOCK_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 blocks layout type. ++ Must be automatically selected by supporting filesystems. ++ + config FILE_LOCKING + bool "Enable POSIX file locking API" if EXPERT + default y +diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.38.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c +--- linux-2.6.38.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2011-03-26 07:57:44.235821643 -0400 ++++ linux-2.6.38.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2011-03-26 07:57:44.235821643 -0400 +@@ -0,0 +1,66 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pipefs_list bl_device_list; ++struct dentry *bl_device_pipe; ++ ++ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t len) ++{ ++ int err; ++ struct pipefs_hdr *msg; ++ ++ dprintk("Entering %s...\n", __func__); ++ ++ msg = pipefs_readmsg(filp, src, len); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: unable to read pipefs message.\n"); ++ return PTR_ERR(msg); ++ } ++ ++ /* now assign the result, which wakes the blocked thread */ ++ err = pipefs_assign_upcall_reply(msg, &bl_device_list); ++ if (err) { ++ dprintk("ERROR: failed to assign upcall with id %u\n", ++ msg->msgid); ++ kfree(msg); ++ } ++ return len; ++} ++ ++static const struct rpc_pipe_ops bl_pipe_ops = { ++ .upcall = pipefs_generic_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = pipefs_generic_destroy_msg, ++}; ++ ++int bl_pipe_init(void) ++{ ++ dprintk("%s: 
block_device pipefs registering...\n", __func__); ++ bl_device_pipe = pipefs_mkpipe("bl_device_pipe", &bl_pipe_ops, 1); ++ if (IS_ERR(bl_device_pipe)) ++ dprintk("ERROR, unable to make block_device pipe\n"); ++ ++ if (!bl_device_pipe) ++ dprintk("bl_device_pipe is NULL!\n"); ++ else ++ dprintk("bl_device_pipe created!\n"); ++ pipefs_init_list(&bl_device_list); ++ return 0; ++} ++ ++void bl_pipe_exit(void) ++{ ++ dprintk("%s: block_device pipefs unregistering...\n", __func__); ++ if (IS_ERR(bl_device_pipe)) ++ return ; ++ pipefs_closepipe(bl_device_pipe); ++ return; ++} +diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.c +--- linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2011-03-26 07:57:44.237821622 -0400 ++++ linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.c 2011-03-26 07:57:44.237821622 -0400 +@@ -0,0 +1,1146 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. 
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#include ++#include ++ ++#include /* various write calls */ ++#include /* struct bio */ ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Andy Adamson "); ++MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); ++ ++/* Callback operations to the pNFS client */ ++ ++static void print_page(struct page *page) ++{ ++ dprintk("PRINTPAGE page %p\n", page); ++ dprintk(" PagePrivate %d\n", PagePrivate(page)); ++ dprintk(" PageUptodate %d\n", PageUptodate(page)); ++ dprintk(" PageError %d\n", PageError(page)); ++ dprintk(" PageDirty %d\n", PageDirty(page)); ++ dprintk(" PageReferenced %d\n", PageReferenced(page)); ++ dprintk(" PageLocked %d\n", PageLocked(page)); ++ dprintk(" PageWriteback %d\n", PageWriteback(page)); ++ dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); ++ dprintk("\n"); ++} ++ ++/* Given the be associated with isect, determine if page data needs to be ++ * initialized. 
++ */ ++static int is_hole(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return !is_sector_initialized(be->be_inval, isect); ++} ++ ++/* Given the be associated with isect, determine if page data can be ++ * written to disk. ++ */ ++static int is_writable(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return is_sector_initialized(be->be_inval, isect); ++} ++ ++static int ++dont_like_caller(struct nfs_page *req) ++{ ++ if (atomic_read(&req->wb_complete)) { ++ /* Called by _multi */ ++ return 1; ++ } else { ++ /* Called by _one */ ++ return 0; ++ } ++} ++ ++static enum pnfs_try_status ++bl_commit(struct nfs_write_data *nfs_data, ++ int sync) ++{ ++ dprintk("%s enter\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++/* The data we are handed might be spread across several bios. We need ++ * to track when the last one is finished. 
++ */ ++struct parallel_io { ++ struct kref refcnt; ++ struct rpc_call_ops call_ops; ++ void (*pnfs_callback) (void *data); ++ void *data; ++}; ++ ++static inline struct parallel_io *alloc_parallel(void *data) ++{ ++ struct parallel_io *rv; ++ ++ rv = kmalloc(sizeof(*rv), GFP_KERNEL); ++ if (rv) { ++ rv->data = data; ++ kref_init(&rv->refcnt); ++ } ++ return rv; ++} ++ ++static inline void get_parallel(struct parallel_io *p) ++{ ++ kref_get(&p->refcnt); ++} ++ ++static void destroy_parallel(struct kref *kref) ++{ ++ struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); ++ ++ dprintk("%s enter\n", __func__); ++ p->pnfs_callback(p->data); ++ kfree(p); ++} ++ ++static inline void put_parallel(struct parallel_io *p) ++{ ++ kref_put(&p->refcnt, destroy_parallel); ++} ++ ++static struct bio * ++bl_submit_bio(int rw, struct bio *bio) ++{ ++ if (bio) { ++ get_parallel(bio->bi_private); ++ dprintk("%s submitting %s bio %u@%llu\n", __func__, ++ rw == READ ? "read" : "write", ++ bio->bi_size, (u64)bio->bi_sector); ++ submit_bio(rw, bio); ++ } ++ return NULL; ++} ++ ++static inline void ++bl_done_with_rpage(struct page *page, const int ok) ++{ ++ if (ok) { ++ ClearPagePnfsErr(page); ++ SetPageUptodate(page); ++ } else { ++ ClearPageUptodate(page); ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ } ++ /* Page is unlocked via rpc_release. Should really be done here. 
*/ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_read(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_rpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++static void bl_read_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ pnfs_read_done(rdata); ++} ++ ++static void ++bl_end_par_io_read(void *data) ++{ ++ struct nfs_read_data *rdata = data; ++ ++ INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); ++ schedule_work(&rdata->task.u.tk_work); ++} ++ ++/* We don't want normal .rpc_call_done callback used, so we replace it ++ * with this stub. 
++ */
++static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
++{
++	return;
++}
++
++/* Read the pages of @rdata directly from the block device(s) named by the
++ * layout extents, starting at page index pgbase >> PAGE_CACHE_SHIFT.
++ *
++ * Returns PNFS_NOT_ATTEMPTED only from the early checks (caller rejected,
++ * single page already flagged PG_pnfserr, parallel_io allocation failure).
++ * Once the parallel_io exists the function always returns PNFS_ATTEMPTED;
++ * per-page failures are reported through bl_done_with_rpage(page, 0).
++ */
++static enum pnfs_try_status
++bl_read_pagelist(struct nfs_read_data *rdata,
++		 unsigned nr_pages)
++{
++	int i, hole;
++	struct bio *bio = NULL;
++	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
++	sector_t isect, extent_length = 0;
++	struct parallel_io *par;
++	loff_t f_offset = rdata->args.offset;
++	size_t count = rdata->args.count;
++	struct page **pages = rdata->args.pages;
++	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
++
++	dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
++	       nr_pages, f_offset, count);
++
++	if (dont_like_caller(rdata->req)) {
++		dprintk("%s dont_like_caller failed\n", __func__);
++		goto use_mds;
++	}
++	if ((nr_pages == 1) && PagePnfsErr(rdata->req->wb_page)) {
++		/* We want to fall back to mds in case of read_page
++		 * after error on read_pages.
++		 */
++		dprintk("%s PG_pnfserr set\n", __func__);
++		goto use_mds;
++	}
++	par = alloc_parallel(rdata);
++	if (!par)
++		goto use_mds;
++	/* Copy the caller's ops but neutralise rpc_call_done */
++	par->call_ops = *rdata->pdata.call_ops;
++	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
++	par->pnfs_callback = bl_end_par_io_read;
++	/* At this point, we can no longer jump to use_mds */
++
++	/* Byte offset -> 512-byte sector */
++	isect = (sector_t) (f_offset >> 9);
++	/* Code assumes extents are page-aligned */
++	for (i = pg_index; i < nr_pages; i++) {
++		if (!extent_length) {
++			/* We've used up the previous extent */
++			put_extent(be);
++			put_extent(cow_read);
++			bio = bl_submit_bio(READ, bio);
++			/* Get the next one */
++			be = find_get_extent(BLK_LSEG2EXT(rdata->pdata.lseg),
++					     isect, &cow_read);
++			if (!be) {
++				/* Error out this page */
++				bl_done_with_rpage(pages[i], 0);
++				break;
++			}
++			/* Sectors left in this extent from isect onwards,
++			 * clipped to the COW extent when there is one.
++			 */
++			extent_length = be->be_length -
++				(isect - be->be_f_offset);
++			if (cow_read) {
++				sector_t cow_length = cow_read->be_length -
++					(isect - cow_read->be_f_offset);
++				extent_length = min(extent_length, cow_length);
++			}
++		}
++		hole = is_hole(be, isect);
++		if (hole && !cow_read) {
++			bio = bl_submit_bio(READ, bio);
++			/* Fill hole w/ zeroes w/o accessing device */
++			dprintk("%s Zeroing page for hole\n", __func__);
++			zero_user(pages[i], 0,
++				  min_t(int, PAGE_CACHE_SIZE, count));
++			print_page(pages[i]);
++			bl_done_with_rpage(pages[i], 1);
++		} else {
++			struct pnfs_block_extent *be_read;
++
++			/* A hole backed by a COW extent is read from the
++			 * COW extent instead.
++			 */
++			be_read = (hole && cow_read) ? cow_read : be;
++			/* Retry with a fresh bio when the current one is full */
++			for (;;) {
++				if (!bio) {
++					bio = bio_alloc(GFP_NOIO, nr_pages - i);
++					if (!bio) {
++						/* Error out this page */
++						bl_done_with_rpage(pages[i], 0);
++						break;
++					}
++					bio->bi_sector = isect -
++						be_read->be_f_offset +
++						be_read->be_v_offset;
++					bio->bi_bdev = be_read->be_mdev;
++					bio->bi_end_io = bl_end_io_read;
++					bio->bi_private = par;
++				}
++				if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
++					break;
++				bio = bl_submit_bio(READ, bio);
++			}
++		}
++		isect += PAGE_CACHE_SIZE >> 9;
++		extent_length -= PAGE_CACHE_SIZE >> 9;
++	}
++	/* Report how far we got, clamped to i_size (which also sets eof) */
++	if ((isect << 9) >= rdata->inode->i_size) {
++		rdata->res.eof = 1;
++		rdata->res.count = rdata->inode->i_size - f_offset;
++	} else {
++		rdata->res.count = (isect << 9) - f_offset;
++	}
++	put_extent(be);
++	put_extent(cow_read);
++	bl_submit_bio(READ, bio);
++	put_parallel(par);
++	return PNFS_ATTEMPTED;
++
++ use_mds:
++	dprintk("Giving up and using normal NFS\n");
++	return PNFS_NOT_ATTEMPTED;
++}
++
++/* Walk the extents covering the page-aligned byte range around
++ * [offset, offset + count) and call mark_for_commit() on every piece that
++ * lies in a PNFS_BLOCK_INVALID_DATA extent.  A zero count is a no-op.
++ * BUGs if no extent covers part of the range.
++ */
++static void mark_extents_written(struct pnfs_block_layout *bl,
++				 __u64 offset, __u32 count)
++{
++	sector_t isect, end;
++	struct pnfs_block_extent *be;
++
++	dprintk("%s(%llu, %u)\n", __func__, offset, count);
++	if (count == 0)
++		return;
++	/* Round start down and end up to page boundaries, then to sectors */
++	isect = (offset & (long)(PAGE_CACHE_MASK)) >> 9;
++	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
++	end >>= 9;
++	while (isect < end) {
++		sector_t len;
++		be = find_get_extent(bl, isect, NULL);
++		BUG_ON(!be); /* FIXME */
++		len = min(end, be->be_f_offset + be->be_length) - isect;
++		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
++			mark_for_commit(be, isect, len); /* What if fails? */
++		isect += len;
++		put_extent(be);
++	}
++}
++
++/* STUB - this needs thought */
++/* Per-page write completion: on failure (@ok == 0) flag the page with
++ * PG_error and PG_pnfserr and mark the inode's cached data invalid.
++ */
++static inline void
++bl_done_with_wpage(struct page *page, const int ok)
++{
++	if (!ok) {
++		SetPageError(page);
++		SetPagePnfsErr(page);
++		/* This is an inline copy of nfs_zap_mapping */
++		/* This is oh so fishy, and needs deep thought */
++		if (page->mapping->nrpages != 0) {
++			struct inode *inode = page->mapping->host;
++			spin_lock(&inode->i_lock);
++			NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
++			spin_unlock(&inode->i_lock);
++		}
++	}
++	/* end_page_writeback called in rpc_release. Should be done here. */
++}
++
++/* This is basically copied from mpage_end_io_read */
++/* bio completion for writes: walks the bio's pages from last to first,
++ * hands each to bl_done_with_wpage(), then drops the bio and one
++ * reference on the parallel_io stored in bi_private.
++ */
++static void bl_end_io_write(struct bio *bio, int err)
++{
++	void *data = bio->bi_private;
++	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
++	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
++
++	do {
++		struct page *page = bvec->bv_page;
++
++		if (--bvec >= bio->bi_io_vec)
++			prefetchw(&bvec->bv_page->flags);
++		bl_done_with_wpage(page, uptodate);
++	} while (bvec >= bio->bi_io_vec);
++	bio_put(bio);
++	put_parallel(data);
++}
++
++/* Function scheduled for call during bl_end_par_io_write,
++ * it marks sectors as written and extends the commitlist.
++ */
++static void bl_write_cleanup(struct work_struct *work)
++{
++	struct rpc_task *task;
++	struct nfs_write_data *wdata;
++	dprintk("%s enter\n", __func__);
++	/* Recover the write data from the work item embedded in its task */
++	task = container_of(work, struct rpc_task, u.tk_work);
++	wdata = container_of(task, struct nfs_write_data, task);
++	if (!wdata->task.tk_status) {
++		/* Marks for LAYOUTCOMMIT */
++		/* BUG - this should be called after each bio, not after
++		 * all finish, unless have some way of storing success/failure
++		 */
++		mark_extents_written(BLK_LSEG2EXT(wdata->pdata.lseg),
++				     wdata->args.offset, wdata->args.count);
++	}
++	pnfs_writeback_done(wdata);
++}
++
++/* Called when last of bios associated with a bl_write_pagelist call finishes */
++static void
++bl_end_par_io_write(void *data)
++{
++	struct nfs_write_data *wdata = data;
++
++	/* STUB - ignoring error handling */
++	/* Status is unconditionally reported as success / FILE_SYNC */
++	wdata->task.tk_status = 0;
++	wdata->verf.committed = NFS_FILE_SYNC;
++	/* Finish in bl_write_cleanup() via the shared workqueue */
++	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
++	schedule_work(&wdata->task.u.tk_work);
++}
++
++/* Write the pages of @wdata directly to the block device(s) named by the
++ * layout extents.
++ *
++ * Returns PNFS_NOT_ATTEMPTED when the request has no lseg, the caller is
++ * rejected, or the parallel_io cannot be allocated; PNFS_ATTEMPTED
++ * otherwise, with per-page failures flagged by bl_done_with_wpage(page, 0).
++ * The @sync argument is not used in this function.
++ */
++static enum pnfs_try_status
++bl_write_pagelist(struct nfs_write_data *wdata,
++		  unsigned nr_pages,
++		  int sync)
++{
++	int i;
++	struct bio *bio = NULL;
++	struct pnfs_block_extent *be = NULL;
++	sector_t isect, extent_length = 0;
++	struct parallel_io *par;
++	loff_t offset = wdata->args.offset;
++	size_t count = wdata->args.count;
++	struct page **pages = wdata->args.pages;
++	int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
++
++	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
++	if (!wdata->req->wb_lseg) {
++		dprintk("%s no lseg, falling back to MDS\n", __func__);
++		return PNFS_NOT_ATTEMPTED;
++	}
++	if (dont_like_caller(wdata->req)) {
++		dprintk("%s dont_like_caller failed\n", __func__);
++		return PNFS_NOT_ATTEMPTED;
++	}
++	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
++	 * We want to write each, and if there is an error remove it from
++	 * list and call
++	 * nfs_retry_request(req) to have it redone using nfs.
++	 * QUEST? Do as block or per req? Think have to do per block
++	 * as part of end_bio
++	 */
++	par = alloc_parallel(wdata);
++	if (!par)
++		return PNFS_NOT_ATTEMPTED;
++	/* Copy the caller's ops but neutralise rpc_call_done */
++	par->call_ops = *wdata->pdata.call_ops;
++	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
++	par->pnfs_callback = bl_end_par_io_write;
++	/* At this point, have to be more careful with error handling */
++
++	/* Start sector of the page containing @offset */
++	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9);
++	for (i = pg_index; i < nr_pages; i++) {
++		if (!extent_length) {
++			/* We've used up the previous extent */
++			put_extent(be);
++			bio = bl_submit_bio(WRITE, bio);
++			/* Get the next one */
++			be = find_get_extent(BLK_LSEG2EXT(wdata->pdata.lseg),
++					     isect, NULL);
++			if (!be || !is_writable(be, isect)) {
++				/* FIXME */
++				bl_done_with_wpage(pages[i], 0);
++				break;
++			}
++			extent_length = be->be_length -
++				(isect - be->be_f_offset);
++		}
++		/* Retry with a fresh bio when the current one is full */
++		for (;;) {
++			if (!bio) {
++				bio = bio_alloc(GFP_NOIO, nr_pages - i);
++				if (!bio) {
++					/* Error out this page */
++					/* FIXME */
++					bl_done_with_wpage(pages[i], 0);
++					break;
++				}
++				bio->bi_sector = isect - be->be_f_offset +
++					be->be_v_offset;
++				bio->bi_bdev = be->be_mdev;
++				bio->bi_end_io = bl_end_io_write;
++				bio->bi_private = par;
++			}
++			if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
++				break;
++			bio = bl_submit_bio(WRITE, bio);
++		}
++		isect += PAGE_CACHE_SIZE >> 9;
++		extent_length -= PAGE_CACHE_SIZE >> 9;
++	}
++	/* Bytes covered, measured from the start of the first page */
++	wdata->res.count = (isect << 9) - (offset & (long)PAGE_CACHE_MASK);
++	put_extent(be);
++	bl_submit_bio(WRITE, bio);
++	put_parallel(par);
++	return PNFS_ATTEMPTED;
++}
++
++/* FIXME - range ignored */
++/* Unlink and drop every extent on all EXTENT_LISTS lists of @bl, under
++ * bl_ext_lock.
++ */
++static void
++release_extents(struct pnfs_block_layout *bl,
++		struct pnfs_layout_range *range)
++{
++	int i;
++	struct pnfs_block_extent *be;
++
++	spin_lock(&bl->bl_ext_lock);
++	for (i = 0; i < EXTENT_LISTS; i++) {
++		while (!list_empty(&bl->bl_extents[i])) {
++			be = list_first_entry(&bl->bl_extents[i],
++					      struct pnfs_block_extent,
++					      be_node);
++			list_del(&be->be_node);
++			put_extent(be);
++		}
++	}
++	spin_unlock(&bl->bl_ext_lock);
++}
++
++/* Free every tracking entry linked on @marks->im_tree.mtt_stub.  No lock
++ * is taken here.
++ */
++static void
++release_inval_marks(struct pnfs_inval_markings *marks)
++{
++	struct pnfs_inval_tracking *pos, *temp;
++
++	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
++		list_del(&pos->it_link);
++		kfree(pos);
++	}
++	return;
++}
++
++/* Note we are relying on caller locking to prevent nasty races. */
++static void
++bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
++{
++	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
++
++	dprintk("%s enter\n", __func__);
++	release_extents(bl, NULL);
++	release_inval_marks(&bl->bl_inval);
++	kfree(bl);
++}
++
++/* Allocate and initialise the block-layout wrapper for @inode and return
++ * its embedded generic header, or NULL on allocation failure.
++ */
++static struct pnfs_layout_hdr *
++bl_alloc_layout_hdr(struct inode *inode)
++{
++	struct pnfs_block_layout *bl;
++
++	dprintk("%s enter\n", __func__);
++	bl = kzalloc(sizeof(*bl), GFP_KERNEL);
++	if (!bl)
++		return NULL;
++	spin_lock_init(&bl->bl_ext_lock);
++	INIT_LIST_HEAD(&bl->bl_extents[0]);
++	INIT_LIST_HEAD(&bl->bl_extents[1]);
++	INIT_LIST_HEAD(&bl->bl_commit);
++	bl->bl_count = 0;
++	/* Server block size in bytes -> 512-byte sectors */
++	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> 9;
++	INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
++	return &bl->bl_layout;
++}
++
++/* The lseg carries no driver-private state here; just free it. */
++static void
++bl_free_lseg(struct pnfs_layout_segment *lseg)
++{
++	dprintk("%s enter\n", __func__);
++	kfree(lseg);
++}
++
++/* Because the generic infrastructure does not correctly merge layouts,
++ * we pretty much ignore lseg, and store all data layout wide, so we
++ * can correctly merge. Eventually we should push some correct merge
++ * behavior up to the generic code, as the current behavior tends to
++ * cause lots of unnecessary overlapping LAYOUTGET requests.
++ */
++static struct pnfs_layout_segment *
++bl_alloc_lseg(struct pnfs_layout_hdr *lo,
++	      struct nfs4_layoutget_res *lgr)
++{
++	struct pnfs_layout_segment *lseg;
++	int status;
++
++	dprintk("%s enter\n", __func__);
++	lseg = kzalloc(sizeof(*lseg) + 0, GFP_KERNEL);
++	if (!lseg)
++		return NULL;
++	status = nfs4_blk_process_layoutget(lo, lgr);
++	if (status) {
++		/* We don't want to call the full-blown bl_free_lseg,
++		 * since on error extents were not touched.
++		 */
++		/* STUB - we really want to distinguish between 2 error
++		 * conditions here. This lseg failed, but lo data structures
++		 * are OK, or we hosed the lo data structures. The calling
++		 * code probably needs to distinguish this too.
++		 */
++		kfree(lseg);
++		return ERR_PTR(status);
++	}
++	return lseg;
++}
++
++/* Widen the commit range to block-size boundaries and attach an empty
++ * bl_layoutupdate_data to @arg.  Returns 0, or -ENOMEM.
++ */
++static int
++bl_setup_layoutcommit(struct pnfs_layout_hdr *lo,
++		      struct nfs4_layoutcommit_args *arg)
++{
++	struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
++	struct bl_layoutupdate_data *layoutupdate_data;
++
++	dprintk("%s enter\n", __func__);
++	/* Need to ensure commit is block-size aligned */
++	if (nfss->pnfs_blksize) {
++		/* NOTE(review): the mask arithmetic presumes pnfs_blksize
++		 * is a power of two - confirm against the server contract.
++		 */
++		u64 mask = nfss->pnfs_blksize - 1;
++		u64 offset = arg->range.offset & mask;
++
++		arg->range.offset -= offset;
++		arg->range.length += offset + mask;
++		arg->range.length &= ~mask;
++	}
++
++	layoutupdate_data = kmalloc(sizeof(struct bl_layoutupdate_data),
++				    GFP_KERNEL);
++	if (unlikely(!layoutupdate_data))
++		return -ENOMEM;
++	INIT_LIST_HEAD(&layoutupdate_data->ranges);
++	arg->layoutdriver_data = layoutupdate_data;
++
++	return 0;
++}
++
++static void
++bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
++		       const struct nfs4_layoutcommit_args *arg)
++{
++	dprintk("%s enter\n", __func__);
++	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
++}
++
++/* Undo bl_setup_layoutcommit(): post-process the update with the server's
++ * status, then free the driver data allocated there.
++ */
++static void
++bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo,
++			struct nfs4_layoutcommit_data *lcdata)
++{
++	dprintk("%s enter\n", __func__);
++	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
++	kfree(lcdata->args.layoutdriver_data);
++}
++
++/* Free every device on @mid's list and then @mid itself; NULL is a no-op. */
++static void free_blk_mountid(struct block_mount_id *mid)
++{
++	if (mid) {
++		struct pnfs_block_dev *dev;
++		spin_lock(&mid->bm_lock);
++		while (!list_empty(&mid->bm_devlist)) {
++			dev = list_first_entry(&mid->bm_devlist,
++					       struct pnfs_block_dev,
++					       bm_node);
++			list_del(&dev->bm_node);
++			free_block_dev(dev);
++		}
++		spin_unlock(&mid->bm_lock);
++		kfree(mid);
++	}
++}
++
++/* This is mostly copied from the filelayout's get_device_info function.
++ * It seems much of this should be at the generic pnfs level.
++ */
++/* Issue GETDEVICEINFO for @d_id into a temporary vmap'ed page buffer and
++ * decode the reply.  Returns the decoded device, or NULL on any failure
++ * (allocation, RPC error, decode error).  The temporary buffer is always
++ * released before returning.
++ */
++static struct pnfs_block_dev *
++nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
++			struct nfs4_deviceid *d_id,
++			struct list_head *sdlist)
++{
++	struct pnfs_device *dev;
++	struct pnfs_block_dev *rv = NULL;
++	u32 max_resp_sz;
++	int max_pages;
++	struct page **pages = NULL;
++	int i, rc;
++
++	/*
++	 * Use the session max response size as the basis for setting
++	 * GETDEVICEINFO's maxcount
++	 */
++	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
++	max_pages = max_resp_sz >> PAGE_SHIFT;
++	dprintk("%s max_resp_sz %u max_pages %d\n",
++		__func__, max_resp_sz, max_pages);
++
++	/* Zeroed so dev->area reads as NULL if we reach out_free before
++	 * vmap() has run.
++	 */
++	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
++	if (!dev) {
++		dprintk("%s kmalloc failed\n", __func__);
++		return NULL;
++	}
++
++	pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
++	if (pages == NULL) {
++		kfree(dev);
++		return NULL;
++	}
++	for (i = 0; i < max_pages; i++) {
++		pages[i] = alloc_page(GFP_KERNEL);
++		if (!pages[i])
++			goto out_free;
++	}
++
++	/* set dev->area */
++	dev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
++	if (!dev->area)
++		goto out_free;
++
++	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
++	dev->layout_type = LAYOUT_BLOCK_VOLUME;
++	dev->pages = pages;
++	dev->pgbase = 0;
++	dev->pglen = PAGE_SIZE * max_pages;
++	dev->mincount = 0;
++
++	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
++	rc = nfs4_proc_getdeviceinfo(server, dev);
++	dprintk("%s getdevice info returns %d\n", __func__, rc);
++	if (rc)
++		goto out_free;
++
++	rv = nfs4_blk_decode_device(server, dev, sdlist);
++ out_free:
++	if (dev->area != NULL)
++		vunmap(dev->area);
++	for (i = 0; i < max_pages; i++) {
++		/* Slots after a failed alloc_page() are still NULL, and
++		 * __free_page() does not accept NULL.
++		 */
++		if (pages[i])
++			__free_page(pages[i]);
++	}
++	kfree(pages);
++	kfree(dev);
++	return rv;
++}
++
++
++/*
++ * Retrieve the list of available devices for the mountpoint.
++ */
++/* On success stores the new block_mount_id in server->pnfs_ld_data and
++ * returns 0.  On failure everything built so far is freed and a negative
++ * errno is returned.
++ */
++static int
++bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
++{
++	struct block_mount_id *b_mt_id = NULL;
++	struct pnfs_devicelist *dlist = NULL;
++	struct pnfs_block_dev *bdev;
++	LIST_HEAD(block_disklist);
++	int status = 0, i;
++
++	dprintk("%s enter\n", __func__);
++
++	if (server->pnfs_blksize == 0) {
++		dprintk("%s Server did not return blksize\n", __func__);
++		return -EINVAL;
++	}
++	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_KERNEL);
++	if (!b_mt_id) {
++		status = -ENOMEM;
++		goto out_error;
++	}
++	/* Initialize nfs4 block layout mount id */
++	spin_lock_init(&b_mt_id->bm_lock);
++	INIT_LIST_HEAD(&b_mt_id->bm_devlist);
++
++	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_KERNEL);
++	if (!dlist) {
++		/* Without this the function returned 0 after freeing
++		 * b_mt_id, i.e. reported success with no driver data.
++		 */
++		status = -ENOMEM;
++		goto out_error;
++	}
++	dlist->eof = 0;
++	while (!dlist->eof) {
++		status = nfs4_proc_getdevicelist(server, fh, dlist);
++		if (status)
++			goto out_error;
++		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
++			__func__, dlist->num_devs, dlist->eof);
++		/* For each device returned in dlist, call GETDEVICEINFO, and
++		 * decode the opaque topology encoding to create a flat
++		 * volume topology, matching VOLUME_SIMPLE disk signatures
++		 * to disks in the visible block disk list.
++		 * Construct an LVM meta device from the flat volume topology.
++		 */
++		for (i = 0; i < dlist->num_devs; i++) {
++			bdev = nfs4_blk_get_deviceinfo(server, fh,
++						       &dlist->dev_id[i],
++						       &block_disklist);
++			if (!bdev) {
++				status = -ENODEV;
++				goto out_error;
++			}
++			spin_lock(&b_mt_id->bm_lock);
++			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
++			spin_unlock(&b_mt_id->bm_lock);
++		}
++	}
++	dprintk("%s SUCCESS\n", __func__);
++	server->pnfs_ld_data = b_mt_id;
++
++ out_return:
++	kfree(dlist);
++	return status;
++
++ out_error:
++	free_blk_mountid(b_mt_id);
++	goto out_return;
++}
++
++static int
++bl_clear_layoutdriver(struct nfs_server *server)
++{
++	struct block_mount_id *b_mt_id = server->pnfs_ld_data;
++
++	dprintk("%s enter\n", __func__);
++	free_blk_mountid(b_mt_id);
++	dprintk("%s RETURNS\n", __func__);
++	return 0;
++}
++
++/* STUB - mark intersection of layout and page as bad, so is not
++ * used again.
++ */
++static void mark_bad_read(void)
++{
++	return;
++}
++
++/* Copied from buffer.c */
++static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
++{
++	if (uptodate) {
++		set_buffer_uptodate(bh);
++	} else {
++		/* This happens, due to failed READA attempts. */
++		clear_buffer_uptodate(bh);
++	}
++	unlock_buffer(bh);
++}
++
++/* Copied from buffer.c */
++static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
++{
++	__end_buffer_read_notouch(bh, uptodate);
++}
++
++/*
++ * map_block: map a requested I/0 block (isect) into an offset in the LVM
++ * meta block_device
++ */
++static void
++map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh)
++{
++	dprintk("%s enter be=%p\n", __func__, be);
++
++	set_buffer_mapped(bh);
++	bh->b_bdev = be->be_mdev;
++	/* Volume sector -> device block number for this bdev's block size */
++	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
++		(be->be_mdev->bd_inode->i_blkbits - 9);
++
++	dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n",
++		__func__, (long)isect,
++		(long)bh->b_blocknr,
++		bh->b_size);
++	return;
++}
++
++/* Given an unmapped page, zero it (or read in page for COW),
++ * and set appropriate flags/markings, but it is safe to not initialize
++ * the range given in [from, to).
++ */
++/* This is loosely based on nobh_write_begin */
++/* Returns 0 on success (page is then flagged MappedToDisk), -ENOMEM if the
++ * temporary buffer_head cannot be allocated, -EIO if no usable extent
++ * covers the page or the read fails, or the error from
++ * mark_initialized_sectors().
++ */
++static int
++init_page_for_write(struct pnfs_block_layout *bl, struct page *page,
++		    unsigned from, unsigned to, sector_t **pages_to_mark)
++{
++	struct buffer_head *bh;
++	int inval, ret = -EIO;
++	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
++	sector_t isect;
++
++	dprintk("%s enter, %p\n", __func__, page);
++	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
++	if (!bh) {
++		ret = -ENOMEM;
++		goto cleanup;
++	}
++
++	isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9);
++	be = find_get_extent(bl, isect, &cow_read);
++	if (!be)
++		goto cleanup;
++	inval = is_hole(be, isect);
++	dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to);
++	if (inval) {
++		if (be->be_state == PNFS_BLOCK_NONE_DATA) {
++			dprintk("%s PANIC - got NONE_DATA extent %p\n",
++				__func__, be);
++			goto cleanup;
++		}
++		map_block(isect, be, bh);
++		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
++	}
++	if (PageUptodate(page)) {
++		/* Do nothing */
++	} else if (inval && !cow_read) {
++		/* Logical test: a bitwise '&' only works while is_hole()
++		 * returns exactly 0 or 1.
++		 */
++		zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE);
++	} else if (0 < from || PAGE_CACHE_SIZE > to) {
++		struct pnfs_block_extent *read_extent;
++
++		read_extent = (inval && cow_read) ? cow_read : be;
++		map_block(isect, read_extent, bh);
++		lock_buffer(bh);
++		bh->b_end_io = end_buffer_read_nobh;
++		submit_bh(READ, bh);
++		dprintk("%s: Waiting for buffer read\n", __func__);
++		/* XXX Don't really want to hold layout lock here */
++		wait_on_buffer(bh);
++		if (!buffer_uptodate(bh))
++			goto cleanup;
++	}
++	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
++		/* There is a BUG here if is a short copy after write_begin,
++		 * but I think this is a generic fs bug. The problem is that
++		 * we have marked the page as initialized, but it is possible
++		 * that the section not copied may never get copied.
++		 */
++		ret = mark_initialized_sectors(be->be_inval, isect,
++					       PAGE_CACHE_SECTORS,
++					       pages_to_mark);
++		/* Want to preallocate mem so above can't fail */
++		if (ret)
++			goto cleanup;
++	}
++	SetPageMappedToDisk(page);
++	ret = 0;
++
++cleanup:
++	/* bh is NULL when alloc_page_buffers() itself failed */
++	if (bh)
++		free_buffer_head(bh);
++	put_extent(be);
++	put_extent(cow_read);
++	if (ret) {
++		/* Need to mark layout with bad read...should now
++		 * just use nfs4 for reads and writes.
++		 */
++		mark_bad_read();
++	}
++	return ret;
++}
++
++/* Prepare @page for a pNFS block write of @count bytes at @pos.  Always
++ * returns 0: when the page cannot be prepared the lseg reference in
++ * @fsdata is dropped so the write continues over plain NFS.
++ */
++static int
++bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos,
++	       unsigned count, struct pnfs_fsdata *fsdata)
++{
++	unsigned from, to;
++	int ret;
++	sector_t *pages_to_mark = NULL;
++	struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg);
++
++	dprintk("%s enter, %u@%lld\n", __func__, count, pos);
++	print_page(page);
++	/* The following code assumes blocksize >= PAGE_CACHE_SIZE */
++	if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) {
++		dprintk("%s Can't handle blocksize %llu\n", __func__,
++			(u64)bl->bl_blocksize);
++		put_lseg(fsdata->lseg);
++		fsdata->lseg = NULL;
++		return 0;
++	}
++	if (PageMappedToDisk(page)) {
++		/* Basically, this is a flag that says we have
++		 * successfully called write_begin already on this page.
++		 */
++		/* NOTE - there are cache consistency issues here.
++		 * For example, what if the layout is recalled, then regained?
++		 * If the file is closed and reopened, will the page flags
++		 * be reset? If not, we'll have to use layout info instead of
++		 * the page flag.
++		 */
++		return 0;
++	}
++	from = pos & (PAGE_CACHE_SIZE - 1);
++	to = from + count;
++	ret = init_page_for_write(bl, page, from, to, &pages_to_mark);
++	if (ret) {
++		dprintk("%s init page failed with %i\n", __func__, ret);
++		/* Revert back to plain NFS and just continue on with
++		 * write. This assumes there is no request attached, which
++		 * should be true if we get here.
++		 */
++		BUG_ON(PagePrivate(page));
++		put_lseg(fsdata->lseg);
++		fsdata->lseg = NULL;
++		kfree(pages_to_mark);
++		ret = 0;
++	} else {
++		fsdata->private = pages_to_mark;
++	}
++	return ret;
++}
++
++/* CAREFUL - what happens if copied < count???
 */
++static int
++bl_write_end(struct inode *inode, struct page *page, loff_t pos,
++	     unsigned count, unsigned copied, struct pnfs_layout_segment *lseg)
++{
++	dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg);
++	print_page(page);
++	/* Only pages still handled through a layout segment are marked
++	 * uptodate here; @copied is not consulted.
++	 */
++	if (lseg)
++		SetPageUptodate(page);
++	return 0;
++}
++
++/* Return any memory allocated to fsdata->private, and take advantage
++ * of no page locks to mark pages noted in write_begin as needing
++ * initialization.
++ */
++static void
++bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata)
++{
++	struct page *page;
++	pgoff_t index;
++	sector_t *pos;
++	struct address_space *mapping = filp->f_mapping;
++	struct pnfs_fsdata *fake_data;
++	struct pnfs_layout_segment *lseg;
++
++	if (!fsdata)
++		return;
++	lseg = fsdata->lseg;
++	if (!lseg)
++		return;
++	pos = fsdata->private;
++	if (!pos)
++		return;
++	dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos));
++	/* fsdata->private is an array of sector numbers ended by ~0 */
++	for (; *pos != ~0; pos++) {
++		index = *pos >> (PAGE_CACHE_SHIFT - 9);
++		/* XXX How do we properly deal with failures here??? */
++		page = grab_cache_page_write_begin(mapping, index, 0);
++		if (!page) {
++			printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__);
++			continue;
++		}
++		dprintk("%s: Examining block page\n", __func__);
++		print_page(page);
++		if (!PageMappedToDisk(page)) {
++			/* XXX How do we properly deal with failures here??? */
++			dprintk("%s Marking block page\n", __func__);
++			/* from == to == PAGE_CACHE_SIZE: nothing in the page
++			 * is exempt from initialisation.  The return value
++			 * is ignored.
++			 */
++			init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page,
++					    PAGE_CACHE_SIZE, PAGE_CACHE_SIZE,
++					    NULL);
++			print_page(page);
++			fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL);
++			if (!fake_data) {
++				printk(KERN_ERR "%s BUG BUG BUG NoMem\n",
++				       __func__);
++				/* NOTE(review): the page is only unlocked
++				 * here - confirm whether the reference taken
++				 * by grab_cache_page_write_begin() must also
++				 * be dropped.
++				 */
++				unlock_page(page);
++				continue;
++			}
++			get_lseg(lseg);
++			fake_data->lseg = lseg;
++			fake_data->bypass_eof = 1;
++			mapping->a_ops->write_end(filp, mapping,
++						  index << PAGE_CACHE_SHIFT,
++						  PAGE_CACHE_SIZE,
++						  PAGE_CACHE_SIZE,
++						  page, fake_data);
++			/* Note fake_data is freed by nfs_write_end */
++		} else
++			/* NOTE(review): same page-reference question as
++			 * above applies to this path.
++			 */
++			unlock_page(page);
++	}
++	kfree(fsdata->private);
++	fsdata->private = NULL;
++}
++
++/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request.
++ * Should return False if there is a reason requests can not be coalesced,
++ * otherwise, should default to returning True.
++ */
++static int
++bl_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
++	   struct nfs_page *req)
++{
++	dprintk("%s enter\n", __func__);
++	/* Writes may only be merged within one layout segment; reads are
++	 * never refused here.
++	 */
++	if (pgio->pg_iswrite)
++		return prev->wb_lseg == req->wb_lseg;
++	else
++		return 1;
++}
++
++/* Operation table handed to pnfs_register_layoutdriver() */
++static struct pnfs_layoutdriver_type blocklayout_type = {
++	.id = LAYOUT_BLOCK_VOLUME,
++	.name = "LAYOUT_BLOCK_VOLUME",
++	.commit = bl_commit,
++	.read_pagelist = bl_read_pagelist,
++	.write_pagelist = bl_write_pagelist,
++	.write_begin = bl_write_begin,
++	.write_end = bl_write_end,
++	.write_end_cleanup = bl_write_end_cleanup,
++	.alloc_layout_hdr = bl_alloc_layout_hdr,
++	.free_layout_hdr = bl_free_layout_hdr,
++	.alloc_lseg = bl_alloc_lseg,
++	.free_lseg = bl_free_lseg,
++	.setup_layoutcommit = bl_setup_layoutcommit,
++	.encode_layoutcommit = bl_encode_layoutcommit,
++	.cleanup_layoutcommit = bl_cleanup_layoutcommit,
++	.set_layoutdriver = bl_set_layoutdriver,
++	.clear_layoutdriver = bl_clear_layoutdriver,
++	.pg_test = bl_pg_test,
++};
++
++/* Module entry: register the driver and, only if that worked, set up the
++ * pipe.  The result of bl_pipe_init() is not checked.
++ */
++static int __init nfs4blocklayout_init(void)
++{
++	int ret;
++
++	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
++
++	ret = pnfs_register_layoutdriver(&blocklayout_type);
++	if (!ret)
++		bl_pipe_init();
++	return ret;
++}
++
++/* Module exit: unregister the driver, then tear down the pipe. */
++static void __exit nfs4blocklayout_exit(void)
++{
++	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
++	       __func__);
++
++	pnfs_unregister_layoutdriver(&blocklayout_type);
++	bl_pipe_exit();
++}
++
++module_init(nfs4blocklayout_init);
++module_exit(nfs4blocklayout_exit);
+diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdev.c
+--- linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig	2011-03-26 07:57:44.238821614 -0400
++++ linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdev.c	2011-03-26 07:57:44.238821614 -0400
+@@ -0,0 +1,334 @@
++/*
++ * linux/fs/nfs/blocklayout/blocklayoutdev.c
++ *
++ * Device operations for the pnfs nfs4 file layout driver.
++ *
++ * Copyright (c) 2006 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Andy Adamson
++ * Fred Isaman
++ *
++ * permission is granted to use, copy, create derivative works and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the university of michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization. if
++ * the above copyright notice or any other identification of the
++ * university of michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ *
++ * this software is provided as is, without representation from the
++ * university of michigan as to its fitness for any purpose, and without
++ * warranty by the university of michigan of any kind, either express
++ * or implied, including without limitation the implied warranties of
++ * merchantability and fitness for a particular purpose.
the regents
++ * of the university of michigan shall not be liable for any damages,
++ * including special, indirect, incidental, or consequential damages,
++ * with respect to any claim arising out or in connection with the use
++ * of the software, even if it has been or is hereafter advised of the
++ * possibility of such damages.
++ */
++#include
++#include /* __bread */
++
++#include
++#include
++#include
++
++#include "blocklayout.h"
++
++#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++
++/* Bounds check for the XDR decoders: returns @p unchanged when @nbytes
++ * (rounded up to whole 32-bit XDR words) starting at @p end at or before
++ * @end, and NULL when they would overrun @end or the pointer arithmetic
++ * wraps.  @p is not advanced.
++ */
++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes)
++{
++	uint32_t *q = p + XDR_QUADLEN(nbytes);
++	if (unlikely(q > end || q < p))
++		return NULL;
++	return p;
++}
++EXPORT_SYMBOL(blk_overflow);
++
++/* Open a block_device by device number. */
++/* The device is opened FMODE_READ with no holder.  Failure is reported as
++ * NULL, never as an ERR_PTR, so callers must test the result with '!'.
++ */
++struct block_device *nfs4_blkdev_get(dev_t dev)
++{
++	struct block_device *bd;
++
++	dprintk("%s enter\n", __func__);
++	bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
++	if (IS_ERR(bd))
++		goto fail;
++	return bd;
++fail:
++	dprintk("%s failed to open device : %ld\n",
++			__func__, PTR_ERR(bd));
++	return NULL;
++}
++
++/*
++ * Release the block device
++ */
++/* Counterpart of nfs4_blkdev_get(); @bdev is dereferenced, so it must not
++ * be NULL.  Returns whatever blkdev_put() returns.
++ */
++int nfs4_blkdev_put(struct block_device *bdev)
++{
++	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
++			MINOR(bdev->bd_dev));
++	return blkdev_put(bdev, FMODE_READ);
++}
++
++/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded
++ * in dev->dev_addr_buf.
++ */
++/* The encoded device address in dev->area is passed to the userspace
++ * daemon over bl_device_pipe; the reply payload carries the major and minor
++ * numbers of the block device to open.
++ *
++ * Returns a newly allocated pnfs_block_dev holding the opened block device,
++ * or NULL on any failure (no pipe, upcall error, daemon refusal, open
++ * failure, out of memory).  Nothing is left open or allocated on failure.
++ * @server and @sdlist are not used here.
++ */
++struct pnfs_block_dev *
++nfs4_blk_decode_device(struct nfs_server *server,
++		       struct pnfs_device *dev,
++		       struct list_head *sdlist)
++{
++	struct pnfs_block_dev *rv = NULL;
++	struct block_device *bd = NULL;
++	struct pipefs_hdr *msg = NULL, *reply = NULL;
++	uint32_t major, minor;
++
++	dprintk("%s enter\n", __func__);
++
++	if (IS_ERR(bl_device_pipe))
++		return NULL;
++	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
++	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
++		dev->mincount);
++	msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area,
++				    dev->mincount);
++	if (IS_ERR(msg)) {
++		dprintk("ERROR: couldn't make pipefs message.\n");
++		goto out_err;
++	}
++	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
++	msg->status = BL_DEVICE_REQUEST_INIT;
++
++	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
++	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
++					      &bl_device_list, 0, 0);
++
++	if (IS_ERR(reply)) {
++		dprintk("ERROR: upcall_waitreply failed\n");
++		goto out_err;
++	}
++	if (reply->status != BL_DEVICE_REQUEST_PROC) {
++		/* No device has been opened yet, so report the daemon's
++		 * status rather than PTR_ERR() of a NULL pointer.
++		 */
++		dprintk("%s upcall failed, status %d\n",
++			__func__, (int)reply->status);
++		goto out_err;
++	}
++	/* Payload layout: 32-bit major followed by 32-bit minor */
++	memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t));
++	memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)),
++		sizeof(uint32_t));
++	bd = nfs4_blkdev_get(MKDEV(major, minor));
++	/* nfs4_blkdev_get() signals failure with NULL, so IS_ERR() alone
++	 * never fired and a NULL bdev was stored in the result.
++	 */
++	if (IS_ERR_OR_NULL(bd)) {
++		dprintk("%s failed to open device : %ld\n",
++			__func__, PTR_ERR(bd));
++		goto out_err;
++	}
++
++	rv = kzalloc(sizeof(*rv), GFP_KERNEL);
++	if (!rv) {
++		/* Don't leak the device we just opened */
++		nfs4_blkdev_put(bd);
++		goto out_err;
++	}
++
++	rv->bm_mdev = bd;
++	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
++	dprintk("%s Created device %s with bd_block_size %u\n",
++		__func__,
++		bd->bd_disk->disk_name,
++		bd->bd_block_size);
++	kfree(reply);
++	kfree(msg);
++	return rv;
++
++out_err:
++	kfree(rv);
++	/* msg and reply may hold ERR_PTR values, which must not be freed */
++	if (!IS_ERR(reply))
++		kfree(reply);
++	if (!IS_ERR(msg))
++		kfree(msg);
++	return NULL;
++}
++
++/* Map
deviceid returned by the server to constructed block_device */
++/* Scans the mount's device list under bm_lock for an entry whose device id
++ * equals @id; yields that entry's block_device, or NULL when none matches.
++ */
++static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
++					    struct nfs4_deviceid *id)
++{
++	struct block_mount_id *mount_id = BLK_ID(lo);
++	struct block_device *found = NULL;
++	struct pnfs_block_dev *entry;
++
++	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
++	spin_lock(&mount_id->bm_lock);
++	list_for_each_entry(entry, &mount_id->bm_devlist, bm_node) {
++		if (memcmp(id->data, entry->bm_mdevid.data,
++			   NFS4_DEVICEID4_SIZE))
++			continue;
++		/* First match wins */
++		found = entry->bm_mdev;
++		break;
++	}
++	spin_unlock(&mount_id->bm_lock);
++	dprintk("%s returning %p\n", __func__, found);
++	return found;
++}
++
++/* Running state used while checking that the extents of one layout obey
++ * the constraints of the spec.
++ */
++struct layout_verification {
++	u32 mode;	/* R or RW */
++	u64 start;	/* Expected start of next non-COW extent */
++	u64 inval;	/* Start of INVAL coverage */
++	u64 cowread;	/* End of COW read coverage */
++};
++
++/* Verify the extent meets the layout requirements of the pnfs-block draft,
++ * section 2.3.1.
++ */
++/* Checks @be against the running state in @lv and advances that state.
++ * Returns 0 when the extent is acceptable, -EIO when it has a state not
++ * allowed for the layout's iomode or does not start where expected.
++ */
++static int verify_extent(struct pnfs_block_extent *be,
++			 struct layout_verification *lv)
++{
++	if (lv->mode == IOMODE_READ) {
++		/* Read layouts may not contain writable extents and must
++		 * be contiguous.
++		 */
++		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
++		    be->be_state == PNFS_BLOCK_INVALID_DATA)
++			return -EIO;
++		if (be->be_f_offset != lv->start)
++			return -EIO;
++		lv->start += be->be_length;
++		return 0;
++	}
++	/* lv->mode == IOMODE_RW */
++	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
++		if (be->be_f_offset != lv->start)
++			return -EIO;
++		if (lv->cowread > lv->start)
++			return -EIO;
++		lv->start += be->be_length;
++		lv->inval = lv->start;
++		return 0;
++	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
++		if (be->be_f_offset != lv->start)
++			return -EIO;
++		lv->start += be->be_length;
++		return 0;
++	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
++		/* COW read extents do not advance lv->start */
++		if (be->be_f_offset > lv->start)
++			return -EIO;
++		if (be->be_f_offset < lv->inval)
++			return -EIO;
++		if (be->be_f_offset < lv->cowread)
++			return -EIO;
++		/* It looks like you might want to min this with lv->start,
++		 * but you really don't.
++		 */
++		lv->inval = lv->inval + be->be_length;
++		lv->cowread = be->be_f_offset + be->be_length;
++		return 0;
++	} else
++		return -EIO;
++}
++
++/* XDR decode pnfs_block_layout4 structure */
++/* Decodes every extent of the LAYOUTGET reply in @lgr, verifies them as a
++ * set, and only then merges them into the layout-wide extent lists of @lo.
++ *
++ * Returns 0 on success, -ENOMEM if an extent cannot be allocated, the
++ * error from add_and_merge_extent() if merging fails, and -EIO for a
++ * malformed or inconsistent reply.  Extents not yet merged are freed on
++ * every error path.
++ */
++int
++nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
++			   struct nfs4_layoutget_res *lgr)
++{
++	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
++	uint32_t *p = (uint32_t *)lgr->layout.buf;
++	uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len);
++	int i, status = -EIO;
++	uint32_t count;
++	size_t left;
++	struct pnfs_block_extent *be = NULL, *save;
++	uint64_t tmp; /* Used by READSECTOR */
++	struct layout_verification lv = {
++		.mode = lgr->range.iomode,
++		.start = lgr->range.offset >> 9,
++		.inval = lgr->range.offset >> 9,
++		.cowread = lgr->range.offset >> 9,
++	};
++
++	LIST_HEAD(extents);
++
++	BLK_READBUF(p, end, 4);
++	READ32(count);
++
++	dprintk("%s enter, number of extents %i\n", __func__, count);
++	/* count comes off the wire.  Bound it by the bytes actually left in
++	 * the buffer first, so that the size computed for the check below
++	 * cannot wrap in 32-bit arithmetic and let the decode loop run past
++	 * the end of the buffer.
++	 */
++	left = (char *)end - (char *)p;
++	if (count > left / (28 + NFS4_DEVICEID4_SIZE)) {
++		dprintk("%s extent count %u exceeds layout buffer\n",
++			__func__, count);
++		goto out_err;
++	}
++	BLK_READBUF(p, end, (28 + NFS4_DEVICEID4_SIZE) * count);
++
++	/* Decode individual extents, putting them in temporary
++	 * staging area until whole layout is decoded to make error
++	 * recovery easier.
++	 */
++	for (i = 0; i < count; i++) {
++		be = alloc_extent();
++		if (!be) {
++			status = -ENOMEM;
++			goto out_err;
++		}
++		READ_DEVID(&be->be_devid);
++		be->be_mdev = translate_devid(lo, &be->be_devid);
++		if (!be->be_mdev)
++			goto out_err;
++		/* The next three values are read in as bytes,
++		 * but stored as 512-byte sector lengths
++		 */
++		READ_SECTOR(be->be_f_offset);
++		READ_SECTOR(be->be_length);
++		READ_SECTOR(be->be_v_offset);
++		READ32(be->be_state);
++		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
++			be->be_inval = &bl->bl_inval;
++		if (verify_extent(be, &lv)) {
++			dprintk("%s verify failed\n", __func__);
++			goto out_err;
++		}
++		list_add_tail(&be->be_node, &extents);
++	}
++	/* From here on the last extent is owned by the staging list, so be
++	 * is cleared before each jump to out_err to avoid a double put.
++	 */
++	if (p != end) {
++		dprintk("%s Undecoded cruft at end of opaque\n", __func__);
++		be = NULL;
++		goto out_err;
++	}
++	if (lgr->range.offset + lgr->range.length != lv.start << 9) {
++		dprintk("%s Final length mismatch\n", __func__);
++		be = NULL;
++		goto out_err;
++	}
++	if (lv.start < lv.cowread) {
++		dprintk("%s Final uncovered COW extent\n", __func__);
++		be = NULL;
++		goto out_err;
++	}
++	/* Extents decoded properly, now try to merge them in to
++	 * existing layout extents.
++	 */
++	spin_lock(&bl->bl_ext_lock);
++	list_for_each_entry_safe(be, save, &extents, be_node) {
++		list_del(&be->be_node);
++		status = add_and_merge_extent(bl, be);
++		if (status) {
++			spin_unlock(&bl->bl_ext_lock);
++			/* This is a fairly catastrophic error, as the
++			 * entire layout extent lists are now corrupted.
++			 * We should have some way to distinguish this.
++			 */
++			be = NULL;
++			goto out_err;
++		}
++	}
++	spin_unlock(&bl->bl_ext_lock);
++	status = 0;
++ out:
++	dprintk("%s returns %i\n", __func__, status);
++	return status;
++
++ out_err:
++	put_extent(be);
++	while (!list_empty(&extents)) {
++		be = list_first_entry(&extents, struct pnfs_block_extent,
++				      be_node);
++		list_del(&be->be_node);
++		put_extent(be);
++	}
++	goto out;
++}
+diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdm.c
+--- linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig	2011-03-26 07:57:44.239821607 -0400
++++ linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayoutdm.c	2011-03-26 07:57:44.239821607 -0400
+@@ -0,0 +1,120 @@
++/*
++ * linux/fs/nfs/blocklayout/blocklayoutdm.c
++ *
++ * Module for the NFSv4.1 pNFS block layout driver.
++ *
++ * Copyright (c) 2007 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Fred Isaman
++ * Andy Adamson
++ *
++ * permission is granted to use, copy, create derivative works and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the university of michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization. if
++ * the above copyright notice or any other identification of the
++ * university of michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ *
++ * this software is provided as is, without representation from the
++ * university of michigan as to its fitness for any purpose, and without
++ * warranty by the university of michigan of any kind, either express
++ * or implied, including without limitation the implied warranties of
++ * merchantability and fitness for a particular purpose.
the regents
++ * of the university of michigan shall not be liable for any damages,
++ * including special, indirect, incidental, or consequential damages,
++ * with respect to any claim arising out or in connection with the use
++ * of the software, even if it has been or is hereafter advised of the
++ * possibility of such damages.
++ */
++
++#include /* gendisk - used in a dprintk*/
++#include
++#include
++
++#include "blocklayout.h"
++
++#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++
++/* Defines used for calculating memory usage in nfs4_blk_flatten() */
++#define ARGSIZE 24 /* Max bytes needed for linear target arg string */
++#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE)
++#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE)
++#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \
++			    (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE)
++/* roundup8() rounds x up to the next multiple of 8; sizeof8() applies it
++ * to sizeof(x).
++ */
++#define roundup8(x) (((x)+7) & ~7)
++#define sizeof8(x) roundup8(sizeof(x))
++
++/*
++ * Send a BL_DEVICE_UMOUNT message for @dev over bl_device_pipe and wait
++ * for the reply.
++ *
++ * Returns 0 when the reply status is BL_DEVICE_REQUEST_PROC and 1 in
++ * every other case (pipe unavailable, message allocation failure, upcall
++ * failure, or any other reply status).
++ */
++static int dev_remove(dev_t dev)
++{
++	int ret = 1;
++	struct pipefs_hdr *msg = NULL, *reply = NULL;
++	uint64_t bl_dev;
++	uint32_t major = MAJOR(dev), minor = MINOR(dev);
++
++	dprintk("Entering %s\n", __func__);
++
++	if (IS_ERR(bl_device_pipe))
++		return ret;
++
++	/* Payload: major in the first four bytes of bl_dev, minor in the
++	 * following four.
++	 */
++	memcpy((void *)&bl_dev, &major, sizeof(uint32_t));
++	memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t));
++	msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev,
++				    sizeof(uint64_t));
++	if (IS_ERR(msg)) {
++		dprintk("ERROR: couldn't make pipefs message.\n");
++		goto out;
++	}
++	/* NOTE(review): this hashes &msg, the address of the local pointer
++	 * variable, not the message itself - confirm that is intended.
++	 */
++	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
++	msg->status = BL_DEVICE_REQUEST_INIT;
++
++	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
++					      &bl_device_list, 0, 0);
++	if (IS_ERR(reply)) {
++		dprintk("ERROR: upcall_waitreply failed\n");
++		goto out;
++	}
++
++	if (reply->status == BL_DEVICE_REQUEST_PROC)
++		ret = 0; /*TODO: what to return*/
++out:
++	/* msg and reply are either NULL, an ERR_PTR, or owned by us */
++	if (!IS_ERR(reply))
++		kfree(reply);
++	if (!IS_ERR(msg))
++		kfree(msg);
++	return ret;
++}
++
++/*
++ * Release meta device
++ *
++ * Puts bdev->bm_mdev with nfs4_blkdev_put(), then calls dev_remove() on
++ * its device number.  Returns the result of dev_remove(); the result of
++ * the put is only logged.
++ */
++static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
++{
++	int rv;
++
++	dprintk("%s Releasing\n", __func__);
++	/* XXX Check return? */
++	rv = nfs4_blkdev_put(bdev->bm_mdev);
++	dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv);
++
++	/* NOTE(review): bm_mdev is dereferenced after the put above -
++	 * confirm the put cannot free the block_device.
++	 */
++	rv = dev_remove(bdev->bm_mdev->bd_dev);
++	dprintk("%s Returns %d\n", __func__, rv);
++	return rv;
++}
++
++/* Release the meta device attached to @bdev, if there is one, and free
++ * @bdev itself.  A NULL @bdev is accepted and ignored.
++ */
++void free_block_dev(struct pnfs_block_dev *bdev)
++{
++	if (bdev) {
++		if (bdev->bm_mdev) {
++			dprintk("%s Removing DM device: %d:%d\n",
++				__func__,
++				MAJOR(bdev->bm_mdev->bd_dev),
++				MINOR(bdev->bm_mdev->bd_dev));
++			/* XXX Check status ?? */
++			nfs4_blk_metadev_release(bdev);
++		}
++		kfree(bdev);
++	}
++}
+diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.h
+--- linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2011-03-26 07:57:44.237821622 -0400
++++ linux-2.6.38.noarch/fs/nfs/blocklayout/blocklayout.h 2011-03-26 07:57:44.238821614 -0400
+@@ -0,0 +1,302 @@
++/*
++ * linux/fs/nfs/blocklayout/blocklayout.h
++ *
++ * Module for the NFSv4.1 pNFS block layout driver.
++ *
++ * Copyright (c) 2006 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Andy Adamson
++ * Fred Isaman
++ *
++ * permission is granted to use, copy, create derivative works and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the university of michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization. if
++ * the above copyright notice or any other identification of the
++ * university of michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#ifndef FS_NFS_NFS4BLOCKLAYOUT_H ++#define FS_NFS_NFS4BLOCKLAYOUT_H ++ ++#include ++#include /* Needed for struct dm_ioctl*/ ++#include "../pnfs.h" ++ ++#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9) ++ ++#define PG_pnfserr PG_owner_priv_1 ++#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags) ++#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags) ++#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags) ++ ++extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_do_resume(struct dm_ioctl *param); ++extern int dm_table_load(struct dm_ioctl *param, size_t param_size); ++ ++struct block_mount_id { ++ spinlock_t bm_lock; /* protects list */ ++ struct list_head bm_devlist; /* holds pnfs_block_dev */ ++}; ++ ++struct pnfs_block_dev { ++ struct list_head bm_node; ++ struct nfs4_deviceid bm_mdevid; /* associated devid */ ++ struct block_device *bm_mdev; /* meta device itself */ ++}; ++ ++/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */ ++struct visible_block_device { ++ struct list_head vi_node; ++ struct block_device *vi_bdev; ++ int vi_mapped; ++ int vi_put_done; ++}; ++ ++enum blk_vol_type { ++ 
PNFS_BLOCK_VOLUME_SIMPLE = 0, /* maps to a single LU */ ++ PNFS_BLOCK_VOLUME_SLICE = 1, /* slice of another volume */ ++ PNFS_BLOCK_VOLUME_CONCAT = 2, /* concatenation of multiple volumes */ ++ PNFS_BLOCK_VOLUME_STRIPE = 3 /* striped across multiple volumes */ ++}; ++ ++/* All disk offset/lengths are stored in 512-byte sectors */ ++struct pnfs_blk_volume { ++ uint32_t bv_type; ++ sector_t bv_size; ++ struct pnfs_blk_volume **bv_vols; ++ int bv_vol_n; ++ union { ++ dev_t bv_dev; ++ sector_t bv_stripe_unit; ++ sector_t bv_offset; ++ }; ++}; ++ ++/* Since components need not be aligned, cannot use sector_t */ ++struct pnfs_blk_sig_comp { ++ int64_t bs_offset; /* In bytes */ ++ uint32_t bs_length; /* In bytes */ ++ char *bs_string; ++}; ++ ++/* Maximum number of signatures components in a simple volume */ ++# define PNFS_BLOCK_MAX_SIG_COMP 16 ++ ++struct pnfs_blk_sig { ++ int si_num_comps; ++ struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP]; ++}; ++ ++enum exstate4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ ++ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ ++}; ++ ++#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ ++ ++struct my_tree_t { ++ sector_t mtt_step_size; /* Internal sector alignment */ ++ struct list_head mtt_stub; /* Should be a radix tree */ ++}; ++ ++struct pnfs_inval_markings { ++ spinlock_t im_lock; ++ struct my_tree_t im_tree; /* Sectors that need LAYOUTCOMMIT */ ++ sector_t im_block_size; /* Server blocksize in sectors */ ++}; ++ ++struct pnfs_inval_tracking { ++ struct list_head it_link; ++ int it_sector; ++ int it_tags; ++}; ++ ++/* sector_t fields are all in 512-byte sectors */ ++struct pnfs_block_extent { ++ struct kref be_refcnt; ++ struct list_head be_node; /* link into lseg list */ ++ struct nfs4_deviceid be_devid; /* STUB - remevable??? 
*/ ++ struct block_device *be_mdev; ++ sector_t be_f_offset; /* the starting offset in the file */ ++ sector_t be_length; /* the size of the extent */ ++ sector_t be_v_offset; /* the starting offset in the volume */ ++ enum exstate4 be_state; /* the state of this extent */ ++ struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ ++}; ++ ++/* Shortened extent used by LAYOUTCOMMIT */ ++struct pnfs_block_short_extent { ++ struct list_head bse_node; ++ struct nfs4_deviceid bse_devid; /* STUB - removable??? */ ++ struct block_device *bse_mdev; ++ sector_t bse_f_offset; /* the starting offset in the file */ ++ sector_t bse_length; /* the size of the extent */ ++}; ++ ++static inline void ++INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) ++{ ++ spin_lock_init(&marks->im_lock); ++ INIT_LIST_HEAD(&marks->im_tree.mtt_stub); ++ marks->im_block_size = blocksize; ++ marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, ++ blocksize); ++} ++ ++enum extentclass4 { ++ RW_EXTENT = 0, /* READWRTE and INVAL */ ++ RO_EXTENT = 1, /* READ and NONE */ ++ EXTENT_LISTS = 2, ++}; ++ ++static inline int choose_list(enum exstate4 state) ++{ ++ if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) ++ return RO_EXTENT; ++ else ++ return RW_EXTENT; ++} ++ ++struct pnfs_block_layout { ++ struct pnfs_layout_hdr bl_layout; ++ struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ ++ spinlock_t bl_ext_lock; /* Protects list manipulation */ ++ struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ ++ struct list_head bl_commit; /* Needs layout commit */ ++ unsigned int bl_count; /* entries in bl_commit */ ++ sector_t bl_blocksize; /* Server blocksize in sectors */ ++}; ++ ++/* this struct is comunicated between: ++ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit ++ */ ++struct bl_layoutupdate_data { ++ struct list_head ranges; ++}; ++ ++#define BLK_ID(lo) ((struct block_mount_id 
*)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) ++ ++static inline struct pnfs_block_layout * ++BLK_LO2EXT(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct pnfs_block_layout, bl_layout); ++} ++ ++static inline struct pnfs_block_layout * ++BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) ++{ ++ return BLK_LO2EXT(lseg->pls_layout); ++} ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes); ++ ++#define BLK_READBUF(p, e, nbytes) do { \ ++ p = blk_overflow(p, e, nbytes); \ ++ if (!p) { \ ++ printk(KERN_WARNING \ ++ "%s: reply buffer overflowed in line %d.\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++} while (0) ++ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (uint64_t)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++#define READ_DEVID(x) COPYMEM((x)->data, NFS4_DEVICEID4_SIZE) ++#define READ_SECTOR(x) do { \ ++ READ64(tmp); \ ++ if (tmp & 0x1ff) { \ ++ printk(KERN_WARNING \ ++ "%s Value not 512-byte aligned at line %d\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++ (x) = tmp >> 9; \ ++} while (0) ++ ++#define WRITE32(n) do { \ ++ *p++ = htonl(n); \ ++ } while (0) ++#define WRITE64(n) do { \ ++ *p++ = htonl((uint32_t)((n) >> 32)); \ ++ *p++ = htonl((uint32_t)(n)); \ ++} while (0) ++#define WRITEMEM(ptr, nbytes) do { \ ++ p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ ++} while (0) ++#define WRITE_DEVID(x) WRITEMEM((x)->data, NFS4_DEVICEID4_SIZE) ++ ++/* blocklayoutdev.c */ ++struct block_device *nfs4_blkdev_get(dev_t dev); ++int nfs4_blkdev_put(struct block_device *bdev); ++struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist); ++int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr); ++int nfs4_blk_create_block_disk_list(struct list_head *); ++void 
nfs4_blk_destroy_disk_list(struct list_head *); ++/* blocklayoutdm.c */ ++int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *); ++void free_block_dev(struct pnfs_block_dev *bdev); ++/* extents.c */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read); ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages); ++void put_extent(struct pnfs_block_extent *be); ++struct pnfs_block_extent *alloc_extent(void); ++struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be); ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect); ++int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg); ++void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct nfs4_layoutcommit_args *arg, ++ int status); ++int add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new); ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length); ++ ++#include ++ ++extern struct pipefs_list bl_device_list; ++extern struct dentry *bl_device_pipe; ++ ++int bl_pipe_init(void); ++void bl_pipe_exit(void); ++ ++#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ ++#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ ++#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ ++#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ ++#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ ++ ++#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ +diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.38.noarch/fs/nfs/blocklayout/extents.c +--- linux-2.6.38.noarch/fs/nfs/blocklayout/extents.c.orig 2011-03-26 07:57:44.240821600 -0400 ++++ linux-2.6.38.noarch/fs/nfs/blocklayout/extents.c 2011-03-26 07:57:44.240821600 -0400 +@@ -0,0 
+1,948 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.h ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include "blocklayout.h" ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Bit numbers */ ++#define EXTENT_INITIALIZED 0 ++#define EXTENT_WRITTEN 1 ++#define EXTENT_IN_COMMIT 2 ++#define INTERNAL_EXISTS MY_MAX_TAGS ++#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) ++ ++/* Returns largest t<=s s.t. 
t%base==0 */ ++static inline sector_t normalize(sector_t s, int base) ++{ ++ sector_t tmp = s; /* Since do_div modifies its argument */ ++ return s - do_div(tmp, base); ++} ++ ++static inline sector_t normalize_up(sector_t s, int base) ++{ ++ return normalize(s + base - 1, base); ++} ++ ++/* Complete stub using list while determine API wanted */ ++ ++/* Returns tags, or negative */ ++static int32_t _find_entry(struct my_tree_t *tree, u64 s) ++{ ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu) enter\n", __func__, s); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) ++ return pos->it_tags & INTERNAL_MASK; ++ else ++ break; ++ } ++ return -ENOENT; ++} ++ ++static inline ++int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag) ++{ ++ int32_t tags; ++ ++ dprintk("%s(%llu, %i) enter\n", __func__, s, tag); ++ s = normalize(s, tree->mtt_step_size); ++ tags = _find_entry(tree, s); ++ if ((tags < 0) || !(tags & (1 << tag))) ++ return 0; ++ else ++ return 1; ++} ++ ++/* Creates entry with tag, or if entry already exists, unions tag to it. ++ * If storage is not NULL, newly created entry will use it. ++ * Returns number of entries added, or negative on error. 
++ */
++static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag,
++		      struct pnfs_inval_tracking *storage)
++{
++	int found = 0;
++	struct pnfs_inval_tracking *pos;
++
++	dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
++	/* Scan from the tail for the entry at s or, failing that, the
++	 * first entry below s, which is where a new entry is linked in.
++	 */
++	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
++		if (pos->it_sector > s)
++			continue;
++		else if (pos->it_sector == s) {
++			found = 1;
++			break;
++		} else
++			break;
++	}
++	if (found) {
++		pos->it_tags |= (1 << tag);
++		return 0;
++	} else {
++		struct pnfs_inval_tracking *new;
++		if (storage)
++			new = storage;
++		else {
++			new = kmalloc(sizeof(*new), GFP_KERNEL);
++			if (!new)
++				return -ENOMEM;
++		}
++		new->it_sector = s;
++		new->it_tags = (1 << tag);
++		/* If the scan ran off the list, &pos->it_link is the list
++		 * head, so the new entry becomes the first one.
++		 */
++		list_add(&new->it_link, &pos->it_link);
++		return 1;
++	}
++}
++
++/* XXXX Really want option to not create */
++/* Over range, unions tag with existing entries, else creates entry with tag */
++/* NOTE(review): _add_entry() returns 1 after successfully creating an
++ * entry, so this returns -ENOMEM as soon as any step in the range had no
++ * entry yet, even though that entry has been added.  Creating one here
++ * also means kmalloc(GFP_KERNEL) while the callers in this file hold
++ * marks->im_lock.  Left unchanged - confirm the intended contract.
++ */
++static int _set_range(struct my_tree_t *tree, int32_t tag, u64 s, u64 length)
++{
++	u64 i;
++
++	dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
++	for (i = normalize(s, tree->mtt_step_size); i < s + length;
++	     i += tree->mtt_step_size)
++		if (_add_entry(tree, i, tag, NULL))
++			return -ENOMEM;
++	return 0;
++}
++
++/* Ensure that future operations on given range of tree will not malloc
++ *
++ * Allocates one tracking entry per step in the step-aligned range that
++ * covers [offset, offset + length), adds the missing ones to the tree
++ * tagged INTERNAL_EXISTS, and frees those that were not needed.
++ * Returns 0 on success or -ENOMEM if an allocation fails.
++ */
++static int _preload_range(struct my_tree_t *tree, u64 offset, u64 length)
++{
++	u64 start, end, s;
++	int count, i, used = 0, status = -ENOMEM;
++	struct pnfs_inval_tracking **storage;
++
++	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
++	start = normalize(offset, tree->mtt_step_size);
++	end = normalize_up(offset + length, tree->mtt_step_size);
++	count = (int)(end - start) / (int)tree->mtt_step_size;
++
++	/* Pre-malloc what memory we might need */
++	storage = kmalloc(sizeof(*storage) * count, GFP_KERNEL);
++	if (!storage)
++		return -ENOMEM;
++	for (i = 0; i < count; i++) {
++		storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
++				     GFP_KERNEL);
++		if (!storage[i])
++			goto out_cleanup;
++	}
++
++	/* Now need lock - HOW??? */
++
++	/* _add_entry() returns 1 only when it consumed storage[used] */
++	for (s = start; s < end; s += tree->mtt_step_size)
++		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
++
++	/* Unlock - HOW??? */
++	status = 0;
++
++ out_cleanup:
++	/* Free the entries that were not handed to the tree.  After a
++	 * failed allocation the NULL slot marks where the initialised
++	 * part of the array ends.
++	 */
++	for (i = used; i < count; i++) {
++		if (!storage[i])
++			break;
++		kfree(storage[i]);
++	}
++	kfree(storage);
++	return status;
++}
++
++/* Insert offset into array, an ascending list of sectors terminated by
++ * ~0, unless it is already present.  A NULL array is ignored.  No bound
++ * is checked here; the caller must have sized the array.
++ */
++static void set_needs_init(sector_t *array, sector_t offset)
++{
++	sector_t *p = array;
++
++	dprintk("%s enter\n", __func__);
++	if (!p)
++		return;
++	while (*p < offset)
++		p++;
++	if (*p == offset)
++		return;
++	else if (*p == ~0) {
++		/* Append at the end and re-terminate */
++		*p++ = offset;
++		*p = ~0;
++		return;
++	} else {
++		/* Shift the tail, terminator included, up by one slot */
++		sector_t *save = p;
++		dprintk("%s Adding %llu\n", __func__, (u64)offset);
++		while (*p != ~0)
++			p++;
++		p++;
++		memmove(save + 1, save, (char *)p - (char *)save);
++		*save = offset;
++		return;
++	}
++}
++
++/* We are relying on page lock to serialize this */
++/* Returns 1 if the step containing isect is tagged EXTENT_INITIALIZED */
++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect)
++{
++	int rv;
++
++	spin_lock(&marks->im_lock);
++	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
++	spin_unlock(&marks->im_lock);
++	return rv;
++}
++
++/* Assume start, end already sector aligned */
++/* Returns 1 if every step from end - step down to start has an entry
++ * carrying tag, 0 otherwise.
++ *
++ * Entries are visited from the tail of the list, highest sector first.
++ * The first entry below end must sit exactly at end - step, and each
++ * following entry exactly one step lower.  An explicit "started" flag is
++ * used, and the next expected sector is never computed when it would go
++ * below zero, so ranges that reach sector 0 are handled correctly.
++ */
++static int
++_range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag)
++{
++	struct pnfs_inval_tracking *pos;
++	u64 expect = 0;
++	int started = 0;
++
++	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
++	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
++		u64 sect = pos->it_sector;
++
++		if (sect >= end)
++			continue;
++		if (!started) {
++			if (sect != end - tree->mtt_step_size)
++				return 0;
++			started = 1;
++		} else if (sect != expect) {
++			return 0;
++		}
++		if (!(pos->it_tags & (1 << tag)))
++			return 0;
++		/* This step is covered.  Finished once the next lower
++		 * step would lie below start, or below sector 0.
++		 */
++		if (sect < tree->mtt_step_size ||
++		    sect - tree->mtt_step_size < start)
++			return 1;
++		expect = sect - tree->mtt_step_size;
++	}
++	return 0;
++}
++
++/* Returns 1 if every step in [start, end) is tagged EXTENT_WRITTEN,
++ * as reported by _range_has_tag() under im_lock.
++ */
++static int is_range_written(struct pnfs_inval_markings *marks,
++			    sector_t start, sector_t end)
++{
++	int rv;
++
++	spin_lock(&marks->im_lock);
++	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
++	spin_unlock(&marks->im_lock);
++	return rv;
++}
++
++/* Marks sectors in [offset, offset+length) as having been initialized.
++ * All lengths are step-aligned, where step is min(pagesize, blocksize).
++ * Notes where partial block is initialized, and helps prepare it for
++ * complete initialization later.
++ *
++ * If pages is not NULL, *pages is set to a kmalloc'ed, ~0-terminated
++ * array of the page-aligned sectors in the surrounding blocks that are
++ * not yet tagged initialized, or to NULL when there are none or on error.
++ * Returns 0 on success and -ENOMEM on any failure.
++ */
++/* Currently assumes offset is page-aligned */
++int mark_initialized_sectors(struct pnfs_inval_markings *marks,
++			     sector_t offset, sector_t length,
++			     sector_t **pages)
++{
++	sector_t s, start, end;
++	sector_t *array = NULL; /* Pages to mark */
++
++	dprintk("%s(offset=%llu,len=%llu) enter\n",
++		__func__, (u64)offset, (u64)length);
++	/* s is first used as the number of slots to allocate for array */
++	s = max((sector_t) 3,
++		2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
++	dprintk("%s set max=%llu\n", __func__, (u64)s);
++	if (pages) {
++		array = kmalloc(s * sizeof(sector_t), GFP_KERNEL);
++		if (!array)
++			goto outerr;
++		array[0] = ~0;
++	}
++
++	/* Widen the range to whole blocks and create the tracking entries
++	 * before im_lock is taken.
++	 */
++	start = normalize(offset, marks->im_block_size);
++	end = normalize_up(offset + length, marks->im_block_size);
++	if (_preload_range(&marks->im_tree, start, end - start))
++		goto outerr;
++
++	spin_lock(&marks->im_lock);
++
++	for (s = normalize_up(start, PAGE_CACHE_SECTORS);
++	     s < offset; s += PAGE_CACHE_SECTORS) {
++		dprintk("%s pre-area pages\n", __func__);
++		/* Portion of used block is not initialized */
++		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
++			set_needs_init(array, s);
++	}
++	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
++		goto out_unlock;
++	for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
++	     s < end; s += PAGE_CACHE_SECTORS) {
++		dprintk("%s post-area pages\n", __func__);
++		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
++			set_needs_init(array, s);
++	}
++
++	spin_unlock(&marks->im_lock);
++
++	if (pages) {
++		/* An array holding only the terminator means nothing
++		 * needs initializing.
++		 */
++		if (array[0] == ~0) {
++			kfree(array);
++			*pages = NULL;
++		} else
++			*pages = array;
++	}
++	return 0;
++
++ out_unlock:
++	spin_unlock(&marks->im_lock);
++ outerr:
++	if (pages) {
++		kfree(array);
++		*pages = NULL;
++	}
++	return -ENOMEM;
++}
++
++/* Marks sectors in [offset, offset+length) as having been written to disk.
++ * All lengths should be block aligned.
++ * Returns the result of _set_range(), called under im_lock.
++ */
++int mark_written_sectors(struct pnfs_inval_markings *marks,
++			 sector_t offset, sector_t length)
++{
++	int status;
++
++	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
++		(u64)offset, (u64)length);
++	spin_lock(&marks->im_lock);
++	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
++	spin_unlock(&marks->im_lock);
++	return status;
++}
++
++/* Debug helper: log one short extent; be may be NULL */
++static void print_short_extent(struct pnfs_block_short_extent *be)
++{
++	dprintk("PRINT SHORT EXTENT extent %p\n", be);
++	if (be) {
++		dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
++		dprintk(" be_length %llu\n", (u64)be->bse_length);
++	}
++}
++
++/* Debug helper: log every short extent on list, and complain if the
++ * number of entries differs from count.
++ */
++void print_clist(struct list_head *list, unsigned int count)
++{
++	struct pnfs_block_short_extent *be;
++	unsigned int i = 0;
++
++	dprintk("****************\n");
++	dprintk("Extent list looks like:\n");
++	list_for_each_entry(be, list, bse_node) {
++		i++;
++		print_short_extent(be);
++	}
++	if (i != count)
++		dprintk("\n\nExpected %u entries\n\n\n", count);
++	dprintk("****************\n");
++}
++
++/* Note: In theory, we should do more checking that devid's match between
++ * old and new, but if they don't, the lists are too corrupt to salvage anyway.
++ */
++/* Note this is very similar to add_and_merge_extent */
++/* Inserts new into bl->bl_commit in file-offset order, merging it with
++ * overlapping or abutting entries on the same device.  new is freed here
++ * if its range is already on the list.  bl->bl_count is kept equal to
++ * the number of list entries.  mark_for_commit() calls this with
++ * bl->bl_ext_lock held.
++ */
++static void add_to_commitlist(struct pnfs_block_layout *bl,
++			      struct pnfs_block_short_extent *new)
++{
++	struct list_head *clist = &bl->bl_commit;
++	struct pnfs_block_short_extent *old, *save;
++	sector_t end = new->bse_f_offset + new->bse_length;
++
++	dprintk("%s enter\n", __func__);
++	print_short_extent(new);
++	print_clist(clist, bl->bl_count);
++	bl->bl_count++;
++	/* Scan for proper place to insert, extending new to the left
++	 * as much as possible.
++	 */
++	list_for_each_entry_safe(old, save, clist, bse_node) {
++		if (new->bse_f_offset < old->bse_f_offset)
++			break;
++		if (end <= old->bse_f_offset + old->bse_length) {
++			/* Range is already in list */
++			bl->bl_count--;
++			kfree(new);
++			return;
++		} else if (new->bse_f_offset <=
++			   old->bse_f_offset + old->bse_length) {
++			/* new overlaps or abuts existing be */
++			if (new->bse_mdev == old->bse_mdev) {
++				/* extend new to fully replace old */
++				new->bse_length += new->bse_f_offset -
++						old->bse_f_offset;
++				new->bse_f_offset = old->bse_f_offset;
++				list_del(&old->bse_node);
++				bl->bl_count--;
++				kfree(old);
++			}
++		}
++	}
++	/* Note that if we never hit the above break, old will not point to a
++	 * valid extent. However, in that case &old->bse_node==list.
++	 */
++	list_add_tail(&new->bse_node, &old->bse_node);
++	/* Scan forward for overlaps. If we find any, extend new and
++	 * remove the overlapped extent.
++	 */
++	old = list_prepare_entry(new, clist, bse_node);
++	list_for_each_entry_safe_continue(old, save, clist, bse_node) {
++		if (end < old->bse_f_offset)
++			break;
++		/* new overlaps or abuts old */
++		if (new->bse_mdev == old->bse_mdev) {
++			if (end < old->bse_f_offset + old->bse_length) {
++				/* extend new to fully cover old */
++				end = old->bse_f_offset + old->bse_length;
++				new->bse_length = end - new->bse_f_offset;
++			}
++			list_del(&old->bse_node);
++			bl->bl_count--;
++			kfree(old);
++		}
++	}
++	dprintk("%s: after merging\n", __func__);
++	print_clist(clist, bl->bl_count);
++}
++
++/* Note the range described by offset, length is guaranteed to be contained
++ * within be.
++ *
++ * Tags the range as written, then queues it on the commit list of the
++ * layout that owns be->be_inval.  Returns 0, or -ENOMEM if the
++ * short extent cannot be allocated.
++ *
++ * NOTE(review): bl is derived from be->be_inval with container_of(), so
++ * this presumably must only be called for extents whose be_inval is set
++ * - confirm against callers.  The return value of mark_written_sectors()
++ * is ignored.
++ */
++int mark_for_commit(struct pnfs_block_extent *be,
++		    sector_t offset, sector_t length)
++{
++	sector_t new_end, end = offset + length;
++	struct pnfs_block_short_extent *new;
++	struct pnfs_block_layout *bl = container_of(be->be_inval,
++						    struct pnfs_block_layout,
++						    bl_inval);
++
++	new = kmalloc(sizeof(*new), GFP_KERNEL);
++	if (!new)
++		return -ENOMEM;
++
++	mark_written_sectors(be->be_inval, offset, length);
++	/* We want to add the range to commit list, but it must be
++	 * block-normalized, and verified that the normalized range has
++	 * been entirely written to disk.
++	 */
++	new->bse_f_offset = offset;
++	offset = normalize(offset, bl->bl_blocksize);
++	if (offset < new->bse_f_offset) {
++		/* Leading partial block: include it only if the rest of
++		 * the block is written, otherwise start at the next block.
++		 */
++		if (is_range_written(be->be_inval, offset, new->bse_f_offset))
++			new->bse_f_offset = offset;
++		else
++			new->bse_f_offset = offset + bl->bl_blocksize;
++	}
++	new_end = normalize_up(end, bl->bl_blocksize);
++	if (end < new_end) {
++		/* Trailing partial block: same rule, at the other end */
++		if (is_range_written(be->be_inval, end, new_end))
++			end = new_end;
++		else
++			end = new_end - bl->bl_blocksize;
++	}
++	/* Nothing block-aligned is left to commit */
++	if (end <= new->bse_f_offset) {
++		kfree(new);
++		return 0;
++	}
++	new->bse_length = end - new->bse_f_offset;
++	new->bse_devid = be->be_devid;
++	new->bse_mdev = be->be_mdev;
++
++	spin_lock(&bl->bl_ext_lock);
++	/* new will be freed, either by add_to_commitlist if it decides not
++	 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
++	 */
++	add_to_commitlist(bl, new);
++	spin_unlock(&bl->bl_ext_lock);
++	return 0;
++}
++
++/* Debug helper: log one extent; be may be NULL */
++static void print_bl_extent(struct pnfs_block_extent *be)
++{
++	dprintk("PRINT EXTENT extent %p\n", be);
++	if (be) {
++		dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
++		dprintk(" be_length %llu\n", (u64)be->be_length);
++		dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
++		dprintk(" be_state %d\n", be->be_state);
++	}
++}
++
++/* kref release callback: frees the extent that embeds kref */
++static void
++destroy_extent(struct kref *kref)
++{
++	struct pnfs_block_extent *be;
++
++	be = container_of(kref, struct pnfs_block_extent, be_refcnt);
++	dprintk("%s be=%p\n", __func__, be);
++	kfree(be);
++}
++
++/* Drop one reference on be; destroy_extent() is the kref release
++ * callback.  A NULL be is ignored.
++ */
++void
++put_extent(struct pnfs_block_extent *be)
++{
++	if (be) {
++		dprintk("%s enter %p (%i)\n", __func__, be,
++			atomic_read(&be->be_refcnt.refcount));
++		kref_put(&be->be_refcnt, destroy_extent);
++	}
++}
++
++/* Allocate an extent with its refcount and list node initialised and
++ * be_inval set to NULL; all other fields are left uninitialised.
++ * Returns NULL if the allocation fails.
++ */
++struct pnfs_block_extent *alloc_extent(void)
++{
++	struct pnfs_block_extent *be;
++
++	be = kmalloc(sizeof(struct pnfs_block_extent), GFP_KERNEL);
++	if (!be)
++		return NULL;
++	INIT_LIST_HEAD(&be->be_node);
++	kref_init(&be->be_refcnt);
++	be->be_inval = NULL;
++	return be;
++}
++
++/* Take a reference on be (if not NULL) and return it */
++struct pnfs_block_extent *
++get_extent(struct pnfs_block_extent *be)
++{
++	if (be)
++		kref_get(&be->be_refcnt);
++	return be;
++}
++
++/* Debug helper: log every extent on list */
++void print_elist(struct list_head *list)
++{
++	struct pnfs_block_extent *be;
++	dprintk("****************\n");
++	dprintk("Extent list looks like:\n");
++	list_for_each_entry(be, list, be_node) {
++		print_bl_extent(be);
++	}
++	dprintk("****************\n");
++}
++
++/* Returns nonzero if old and new have the same state and, unless that
++ * state is NONE_DATA, the same device and the same file-to-volume offset
++ * delta.
++ */
++static inline int
++extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
++{
++	/* Note this assumes new->be_f_offset >= old->be_f_offset */
++	return (new->be_state == old->be_state) &&
++		((new->be_state == PNFS_BLOCK_NONE_DATA) ||
++		 ((new->be_v_offset - old->be_v_offset ==
++		   new->be_f_offset - old->be_f_offset) &&
++		  new->be_mdev == old->be_mdev));
++}
++
++/* Adds new to appropriate list in bl, modifying new and removing existing
++ * extents as appropriate to deal with overlaps.
++ *
++ * See find_get_extent for list constraints.
++ *
++ * Refcount on new is already set. If end up not using it, or error out,
++ * need to put the reference.
++ *
++ * Lock is held by caller.
++ *
++ * Returns 0 on success (including when new turned out to be redundant
++ * and was put), or -EIO if new overlaps an inconsistent extent; in that
++ * case new has been put, and extents removed earlier in the scan are not
++ * restored.
++ */
++int
++add_and_merge_extent(struct pnfs_block_layout *bl,
++		     struct pnfs_block_extent *new)
++{
++	struct pnfs_block_extent *be, *tmp;
++	sector_t end = new->be_f_offset + new->be_length;
++	struct list_head *list;
++
++	dprintk("%s enter with be=%p\n", __func__, new);
++	print_bl_extent(new);
++	list = &bl->bl_extents[choose_list(new->be_state)];
++	print_elist(list);
++
++	/* Scan for proper place to insert, extending new to the left
++	 * as much as possible.
++	 */
++	list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
++		/* be ends at or before new starts: insert after it */
++		if (new->be_f_offset >= be->be_f_offset + be->be_length)
++			break;
++		if (new->be_f_offset >= be->be_f_offset) {
++			if (end <= be->be_f_offset + be->be_length) {
++				/* new is a subset of existing be*/
++				if (extents_consistent(be, new)) {
++					dprintk("%s: new is subset, ignoring\n",
++						__func__);
++					put_extent(new);
++					return 0;
++				} else {
++					goto out_err;
++				}
++			} else {
++				/* |<-- be -->|
++				 *          |<-- new -->| */
++				if (extents_consistent(be, new)) {
++					/* extend new to fully replace be */
++					new->be_length += new->be_f_offset -
++						be->be_f_offset;
++					new->be_f_offset = be->be_f_offset;
++					new->be_v_offset = be->be_v_offset;
++					dprintk("%s: removing %p\n", __func__, be);
++					list_del(&be->be_node);
++					put_extent(be);
++				} else {
++					goto out_err;
++				}
++			}
++		} else if (end >= be->be_f_offset + be->be_length) {
++			/* new extent overlap existing be */
++			if (extents_consistent(be, new)) {
++				/* extend new to fully replace be */
++				dprintk("%s: removing %p\n", __func__, be);
++				list_del(&be->be_node);
++				put_extent(be);
++			} else {
++				goto out_err;
++			}
++		} else if (end > be->be_f_offset) {
++			/*           |<-- be -->|
++			 *|<-- new -->| */
++			if (extents_consistent(new, be)) {
++				/* extend new to fully replace be */
++				new->be_length += be->be_f_offset + be->be_length -
++					new->be_f_offset - new->be_length;
++				dprintk("%s: removing %p\n", __func__, be);
++				list_del(&be->be_node);
++				put_extent(be);
++			} else {
++				goto out_err;
++			}
++		}
++	}
++	/* Note that if we never hit the above break, be will not point to a
++	 * valid extent. However, in that case &be->be_node==list.
++	 */
++	list_add(&new->be_node, &be->be_node);
++	dprintk("%s: inserting new\n", __func__);
++	print_elist(list);
++	/* STUB - The per-list consistency checks have all been done,
++	 * should now check cross-list consistency.
++	 */
++	return 0;
++
++ out_err:
++	put_extent(new);
++	return -EIO;
++}
++
++/* Returns extent, or NULL.
If a second READ extent exists, it is returned ++ * in cow_read, if given. ++ * ++ * The extents are kept in two seperate ordered lists, one for READ and NONE, ++ * one for READWRITE and INVALID. Within each list, we assume: ++ * 1. Extents are ordered by file offset. ++ * 2. For any given isect, there is at most one extents that matches. ++ */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read) ++{ ++ struct pnfs_block_extent *be, *cow, *ret; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ cow = ret = NULL; ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret && ++ (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ if (!ret) ++ ret = be; ++ else if (be->be_state != PNFS_BLOCK_READ_DATA) ++ put_extent(be); ++ else ++ cow = be; ++ break; ++ } ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ if (cow_read) ++ *cow_read = cow; ++ print_bl_extent(ret); ++ return ret; ++} ++ ++/* Similar to find_get_extent, but called with lock held, and ignores cow */ ++static struct pnfs_block_extent * ++find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) ++{ ++ struct pnfs_block_extent *be, *ret = NULL; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ 
atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ ret = be; ++ break; ++ } ++ } ++ } ++ print_bl_extent(ret); ++ return ret; ++} ++ ++int ++encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg) ++{ ++ sector_t start, end; ++ struct pnfs_block_short_extent *lce, *save; ++ unsigned int count = 0; ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct list_head *ranges = &bld->ranges; ++ __be32 *p, *xdr_start; ++ ++ dprintk("%s enter\n", __func__); ++ start = arg->range.offset >> 9; ++ end = start + (arg->range.length >> 9); ++ dprintk("%s set start=%llu, end=%llu\n", ++ __func__, (u64)start, (u64)end); ++ ++ /* BUG - creation of bl_commit is buggy - need to wait for ++ * entire block to be marked WRITTEN before it can be added. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ /* Want to adjust for possible truncate */ ++ /* We now want to adjust argument range */ ++ ++ /* XDR encode the ranges found */ ++ xdr_start = xdr_reserve_space(xdr, 8); ++ if (!xdr_start) ++ goto out; ++ list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { ++ p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); ++ if (!p) ++ break; ++ WRITE_DEVID(&lce->bse_devid); ++ WRITE64(lce->bse_f_offset << 9); ++ WRITE64(lce->bse_length << 9); ++ WRITE64(0LL); ++ WRITE32(PNFS_BLOCK_READWRITE_DATA); ++ list_del(&lce->bse_node); ++ list_add_tail(&lce->bse_node, ranges); ++ bl->bl_count--; ++ count++; ++ } ++ xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); ++ xdr_start[1] = cpu_to_be32(count); ++out: ++ spin_unlock(&bl->bl_ext_lock); ++ dprintk("%s found %i ranges\n", __func__, count); ++ return 0; ++} ++ ++/* Helper function to set_to_rw that initialize a new extent */ ++static void ++_prep_new_extent(struct pnfs_block_extent *new, ++ struct pnfs_block_extent *orig, ++ sector_t offset, sector_t length, int state) ++{ ++ kref_init(&new->be_refcnt); ++ /* don't need to 
INIT_LIST_HEAD(&new->be_node) */ ++ memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); ++ new->be_mdev = orig->be_mdev; ++ new->be_f_offset = offset; ++ new->be_length = length; ++ new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; ++ new->be_state = state; ++ new->be_inval = orig->be_inval; ++} ++ ++/* Tries to merge be with extent in front of it in list. ++ * Frees storage if not used. ++ */ ++static struct pnfs_block_extent * ++_front_merge(struct pnfs_block_extent *be, struct list_head *head, ++ struct pnfs_block_extent *storage) ++{ ++ struct pnfs_block_extent *prev; ++ ++ if (!storage) ++ goto no_merge; ++ if (&be->be_node == head || be->be_node.prev == head) ++ goto no_merge; ++ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); ++ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || ++ !extents_consistent(prev, be)) ++ goto no_merge; ++ _prep_new_extent(storage, prev, prev->be_f_offset, ++ prev->be_length + be->be_length, prev->be_state); ++ list_replace(&prev->be_node, &storage->be_node); ++ put_extent(prev); ++ list_del(&be->be_node); ++ put_extent(be); ++ return storage; ++ ++ no_merge: ++ kfree(storage); ++ return be; ++} ++ ++static u64 ++set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) ++{ ++ u64 rv = offset + length; ++ struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; ++ struct pnfs_block_extent *children[3]; ++ struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; ++ int i = 0, j; ++ ++ dprintk("%s(%llu, %llu)\n", __func__, offset, length); ++ /* Create storage for up to three new extents e1, e2, e3 */ ++ e1 = kmalloc(sizeof(*e1), GFP_KERNEL); ++ e2 = kmalloc(sizeof(*e2), GFP_KERNEL); ++ e3 = kmalloc(sizeof(*e3), GFP_KERNEL); ++ /* BUG - we are ignoring any failure */ ++ if (!e1 || !e2 || !e3) ++ goto out_nosplit; ++ ++ spin_lock(&bl->bl_ext_lock); ++ be = find_get_extent_locked(bl, offset); ++ rv = be->be_f_offset + be->be_length; ++ if (be->be_state != 
PNFS_BLOCK_INVALID_DATA) { ++ spin_unlock(&bl->bl_ext_lock); ++ goto out_nosplit; ++ } ++ /* Add e* to children, bumping e*'s krefs */ ++ if (be->be_f_offset != offset) { ++ _prep_new_extent(e1, be, be->be_f_offset, ++ offset - be->be_f_offset, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e1; ++ print_bl_extent(e1); ++ } else ++ merge1 = e1; ++ _prep_new_extent(e2, be, offset, ++ min(length, be->be_f_offset + be->be_length - offset), ++ PNFS_BLOCK_READWRITE_DATA); ++ children[i++] = e2; ++ print_bl_extent(e2); ++ if (offset + length < be->be_f_offset + be->be_length) { ++ _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, ++ be->be_f_offset + be->be_length - ++ offset - length, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e3; ++ print_bl_extent(e3); ++ } else ++ merge2 = e3; ++ ++ /* Remove be from list, and insert the e* */ ++ /* We don't get refs on e*, since this list is the base reference ++ * set when init'ed. ++ */ ++ if (i < 3) ++ children[i] = NULL; ++ new = children[0]; ++ list_replace(&be->be_node, &new->be_node); ++ put_extent(be); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); ++ for (j = 1; j < i; j++) { ++ old = new; ++ new = children[j]; ++ list_add(&new->be_node, &old->be_node); ++ } ++ if (merge2) { ++ /* This is a HACK, should just create a _back_merge function */ ++ new = list_entry(new->be_node.next, ++ struct pnfs_block_extent, be_node); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ ++ /* Since we removed the base reference above, be is now scheduled for ++ * destruction. 
++ */ ++ put_extent(be); ++ dprintk("%s returns %llu after split\n", __func__, rv); ++ return rv; ++ ++ out_nosplit: ++ kfree(e1); ++ kfree(e2); ++ kfree(e3); ++ dprintk("%s returns %llu without splitting\n", __func__, rv); ++ return rv; ++} ++ ++void ++clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct nfs4_layoutcommit_args *arg, ++ int status) ++{ ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct pnfs_block_short_extent *lce, *save; ++ ++ dprintk("%s status %d\n", __func__, status); ++ list_for_each_entry_safe_reverse(lce, save, &bld->ranges, bse_node) { ++ if (likely(!status)) { ++ u64 offset = lce->bse_f_offset; ++ u64 end = offset + lce->bse_length; ++ ++ do { ++ offset = set_to_rw(bl, offset, end - offset); ++ } while (offset < end); ++ ++ kfree(lce); ++ } else { ++ spin_lock(&bl->bl_ext_lock); ++ add_to_commitlist(bl, lce); ++ spin_unlock(&bl->bl_ext_lock); ++ } ++ } ++} +diff -up linux-2.6.38.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.38.noarch/fs/nfs/blocklayout/Makefile +--- linux-2.6.38.noarch/fs/nfs/blocklayout/Makefile.orig 2011-03-26 07:57:44.235821643 -0400 ++++ linux-2.6.38.noarch/fs/nfs/blocklayout/Makefile 2011-03-26 07:57:44.235821643 -0400 +@@ -0,0 +1,6 @@ ++# ++# Makefile for the pNFS block layout driver kernel module ++# ++obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o ++blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \ ++ extents.o block-device-discovery-pipe.o +diff -up linux-2.6.38.noarch/fs/nfs/callback.h.orig linux-2.6.38.noarch/fs/nfs/callback.h +--- linux-2.6.38.noarch/fs/nfs/callback.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/callback.h 2011-03-26 07:57:44.241821592 -0400 +@@ -167,6 +167,26 @@ extern unsigned nfs4_callback_layoutreca + + extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); + extern void nfs4_cb_take_slot(struct nfs_client *clp); ++ ++struct cb_devicenotifyitem { ++ uint32_t cbd_notify_type; ++ 
uint32_t cbd_layout_type; ++ struct nfs4_deviceid cbd_dev_id; ++ uint32_t cbd_immediate; ++}; ++ ++/* XXX: Should be dynamic up to max compound size */ ++#define NFS4_DEV_NOTIFY_MAXENTRIES 10 ++struct cb_devicenotifyargs { ++ struct sockaddr *addr; ++ int ndevs; ++ struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES]; ++}; ++ ++extern __be32 nfs4_callback_devicenotify( ++ struct cb_devicenotifyargs *args, ++ void *dummy, struct cb_process_state *cps); ++ + #endif /* CONFIG_NFS_V4_1 */ + extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); + extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, +diff -up linux-2.6.38.noarch/fs/nfs/callback_proc.c.orig linux-2.6.38.noarch/fs/nfs/callback_proc.c +--- linux-2.6.38.noarch/fs/nfs/callback_proc.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/callback_proc.c 2011-03-26 07:57:44.241821592 -0400 +@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct + spin_lock(&ino->i_lock); + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + mark_matching_lsegs_invalid(lo, &free_me_list, +- args->cbl_range.iomode)) ++ &args->cbl_range)) + rv = NFS4ERR_DELAY; + else + rv = NFS4ERR_NOMATCHING_LAYOUT; +@@ -184,14 +184,14 @@ static u32 initiate_bulk_draining(struct + ino = lo->plh_inode; + spin_lock(&ino->i_lock); + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); +- if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode)) ++ if (mark_matching_lsegs_invalid(lo, &free_me_list, &range)) + rv = NFS4ERR_DELAY; + list_del_init(&lo->plh_bulk_recall); + spin_unlock(&ino->i_lock); ++ pnfs_free_lseg_list(&free_me_list); + put_layout_hdr(lo); + iput(ino); + } +- pnfs_free_lseg_list(&free_me_list); + return rv; + } + +@@ -241,6 +241,36 @@ static void pnfs_recall_all_layouts(stru + do_callback_layoutrecall(clp, &args); + } + ++__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, ++ void *dummy, struct cb_process_state *cps) ++{ ++ int i; ++ u32 
type, res = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ if (!cps->clp) { ++ res = NFS4ERR_OP_NOT_IN_SESSION; ++ goto out; ++ } ++ ++ for (i = 0; i < args->ndevs; i++) { ++ struct cb_devicenotifyitem *dev = &args->devs[i]; ++ type = dev->cbd_notify_type; ++ if (type == NOTIFY_DEVICEID4_DELETE && cps->clp->cl_devid_cache) ++ pnfs_delete_deviceid(cps->clp->cl_devid_cache, ++ &dev->cbd_dev_id); ++ else if (type == NOTIFY_DEVICEID4_CHANGE) ++ printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE " ++ "not supported\n", __func__); ++ } ++ ++out: ++ dprintk("%s: exit with status = %u\n", ++ __func__, res); ++ return cpu_to_be32(res); ++} ++ + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { + if (delegation == NULL) +diff -up linux-2.6.38.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.38.noarch/fs/nfs/callback_xdr.c +--- linux-2.6.38.noarch/fs/nfs/callback_xdr.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/callback_xdr.c 2011-03-26 07:57:44.242821583 -0400 +@@ -25,6 +25,7 @@ + + #if defined(CONFIG_NFS_V4_1) + #define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) ++#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) + #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +@@ -284,6 +285,93 @@ out: + return status; + } + ++static ++__be32 decode_devicenotify_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_devicenotifyargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ u32 tmp; ++ int n, i; ++ args->ndevs = 0; ++ ++ args->addr = svc_addr(rqstp); ++ ++ /* Num of device notifications */ ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ n = ntohl(*p++); ++ if (n <= 0) ++ goto out; ++ ++ /* XXX: need to possibly return error in this case */ ++ if (n > NFS4_DEV_NOTIFY_MAXENTRIES) { ++ dprintk("%s: Processing (%d) 
notifications out of (%d)\n", ++ __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n); ++ n = NFS4_DEV_NOTIFY_MAXENTRIES; ++ } ++ ++ /* Decode each dev notification */ ++ for (i = 0; i < n; i++) { ++ struct cb_devicenotifyitem *dev = &args->devs[i]; ++ ++ p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* bitmap size */ ++ if (tmp != 1) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_notify_type = ntohl(*p++); ++ if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* opaque size */ ++ if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && ++ (tmp != NFS4_DEVICEID4_SIZE + 8)) || ++ ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && ++ (tmp != NFS4_DEVICEID4_SIZE + 4))) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_layout_type = ntohl(*p++); ++ memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE); ++ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); ++ ++ if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_DELAY); ++ goto out; ++ } ++ dev->cbd_immediate = ntohl(*p++); ++ } else { ++ dev->cbd_immediate = 0; ++ } ++ ++ args->ndevs++; ++ ++ dprintk("%s: type %d layout 0x%x immediate %d\n", ++ __func__, dev->cbd_notify_type, dev->cbd_layout_type, ++ dev->cbd_immediate); ++ } ++out: ++ dprintk("%s: status %d ndevs %d\n", ++ __func__, ntohl(status), args->ndevs); ++ return status; ++} ++ + static __be32 decode_sessionid(struct xdr_stream *xdr, + struct nfs4_sessionid *sid) + { +@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned in + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: + case OP_CB_LAYOUTRECALL: ++ case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + +- case 
OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: +@@ -849,6 +937,12 @@ static struct callback_op callback_ops[] + (callback_decode_arg_t)decode_layoutrecall_args, + .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, + }, ++ [OP_CB_NOTIFY_DEVICEID] = { ++ .process_op = (callback_process_op_t)nfs4_callback_devicenotify, ++ .decode_args = ++ (callback_decode_arg_t)decode_devicenotify_args, ++ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, ++ }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, +diff -up linux-2.6.38.noarch/fs/nfs/client.c.orig linux-2.6.38.noarch/fs/nfs/client.c +--- linux-2.6.38.noarch/fs/nfs/client.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/client.c 2011-03-26 07:57:44.244821565 -0400 +@@ -404,7 +404,7 @@ static int nfs_sockaddr_match_ipaddr(con + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. 
+ */ +-static int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) + { + if (sa1->sa_family != sa2->sa_family) +@@ -418,6 +418,7 @@ static int nfs_sockaddr_cmp(const struct + } + return 0; + } ++EXPORT_SYMBOL(nfs_sockaddr_cmp); + + /* Common match routine for v4.0 and v4.1 callback services */ + bool +@@ -567,6 +568,7 @@ int nfs4_check_client_ready(struct nfs_c + return -EPROTONOSUPPORT; + return 0; + } ++EXPORT_SYMBOL(nfs4_check_client_ready); + + /* + * Initialise the timeout values for a connection +@@ -889,7 +891,7 @@ error: + /* + * Load up the server record from information gained in an fsinfo record + */ +-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) ++static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) + { + unsigned long max_rpc_payload; + +@@ -919,7 +921,9 @@ static void nfs_server_set_fsinfo(struct + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +- set_pnfs_layoutdriver(server, fsinfo->layouttype); ++ server->pnfs_blksize = fsinfo->blksize; ++ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); ++ pnfs_set_ds_iosize(server); + + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); + +@@ -965,7 +969,7 @@ static int nfs_probe_fsinfo(struct nfs_s + if (error < 0) + goto out_error; + +- nfs_server_set_fsinfo(server, &fsinfo); ++ nfs_server_set_fsinfo(server, mntfh, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { +@@ -1355,7 +1359,7 @@ error: + /* + * Set up an NFS4 client + */ +-static int nfs4_set_client(struct nfs_server *server, ++int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, +@@ -1398,6 +1402,7 @@ error: + dprintk("<-- nfs4_set_client() = xerror 
%d\n", error); + return error; + } ++EXPORT_SYMBOL(nfs4_set_client); + + + /* +diff -up linux-2.6.38.noarch/fs/nfsd/bl_com.c.orig linux-2.6.38.noarch/fs/nfsd/bl_com.c +--- linux-2.6.38.noarch/fs/nfsd/bl_com.c.orig 2011-03-26 07:57:44.279821268 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/bl_com.c 2011-03-26 07:57:44.279821268 -0400 +@@ -0,0 +1,292 @@ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); ++static void bl_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops bl_upcall_ops = { ++ .upcall = bl_pipe_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = bl_pipe_destroy_msg, ++}; ++ ++bl_comm_t *bl_comm_global; ++ ++int ++nfsd_bl_start(void) ++{ ++ bl_comm_t *bl_comm = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ dprintk("%s: starting pipe\n", __func__); ++ if (bl_comm_global) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL); ++ if (!bl_comm) { ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ /* FIXME: rename to "spnfs_block" */ ++ bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm, ++ &bl_upcall_ops, 0); ++ if (IS_ERR(bl_comm->pipe_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ mutex_init(&bl_comm->lock); ++ mutex_init(&bl_comm->pipe_lock); ++ init_waitqueue_head(&bl_comm->pipe_wq); ++ ++ bl_comm_global = bl_comm; ++ return 0; ++err: ++ 
rpc_put_mount(); ++ kfree(bl_comm); ++ return rc; ++} ++ ++void ++nfsd_bl_stop(void) ++{ ++ bl_comm_t *c = bl_comm_global; ++ ++ dprintk("%s: stopping pipe\n", __func__); ++ if (!c) ++ return; ++ rpc_unlink(c->pipe_dentry); ++ rpc_put_mount(); ++ bl_comm_global = NULL; ++ kfree(c); ++} ++ ++static ssize_t ++bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst, ++ size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied, ++ left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ ++ return mlen; ++} ++ ++static ssize_t ++bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ bl_comm_t *bc = (bl_comm_t *)rpci->private; ++ bl_comm_msg_t *im = &bc->msg; ++ int ret; ++ bl_comm_res_t *res; ++ ++ ++ if (mlen == 0) { ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ im->msg_res = NULL; ++ wake_up(&bc->pipe_wq); ++ return -EFAULT; ++ } ++ ++ if ((res = kmalloc(mlen, GFP_KERNEL)) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(res, src, mlen)) { ++ kfree(res); ++ return -EFAULT; ++ } ++ ++ mutex_lock(&bc->pipe_lock); ++ ++ ret = mlen; ++ im->msg_status = res->res_status; ++ im->msg_res = res; ++ ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++ return ret; ++} ++ ++static void ++bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ bl_comm_msg_t *im = msg->data; ++ bl_comm_t *bc = container_of(im, struct bl_comm, msg); ++ ++ if (msg->errno >= 0) ++ return; ++ ++ mutex_lock(&bc->pipe_lock); ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++} ++ ++int ++bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res) ++{ ++ struct rpc_pipe_msg msg; ++ DECLARE_WAITQUEUE(wq, current); ++ int rval = 1; 
++ bl_comm_msg_t *m = &bc->msg; ++ ++ if (bc == NULL) { ++ dprintk("%s: No pNFS block daemon available\n", __func__); ++ return 1; ++ } ++ ++ mutex_lock(&bc->lock); ++ mutex_lock(&bc->pipe_lock); ++ ++ memcpy(m, upmsg, sizeof (*m)); ++ ++ memset(&msg, 0, sizeof (msg)); ++ msg.data = m; ++ msg.len = sizeof (*m); ++ ++ add_wait_queue(&bc->pipe_wq, &wq); ++ rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&bc->pipe_lock); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ mutex_lock(&bc->pipe_lock); ++ ++ if (m->msg_status == PNFS_BLOCK_SUCCESS) { ++ *res = m->msg_res; ++ rval = 0; ++ } else ++ rval = 1; ++ ++out: ++ mutex_unlock(&bc->pipe_lock); ++ mutex_unlock(&bc->lock); ++ return rval; ++} ++ ++static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len, ++ loff_t *offset) ++{ ++ int cmd, ++ rc; ++ bl_comm_t *bc = bl_comm_global; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int))) ++ return -EFAULT; ++ switch (cmd) { ++ case PNFS_BLOCK_CTL_STOP: ++ msg.msg_type = PNFS_UPCALL_MSG_STOP; ++ (void) bl_upcall(bc, &msg, &res); ++ kfree(res); ++ nfsd_bl_stop(); ++ break; ++ ++ case PNFS_BLOCK_CTL_START: ++ rc = nfsd_bl_start(); ++ if (rc != 0) ++ return rc; ++ break; ++ ++ case PNFS_BLOCK_CTL_VERS: ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bc, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ kfree(res); ++ break; ++ ++ default: ++ dprintk("%s: unknown ctl command %d\n", __func__, cmd); ++ break; ++ } ++ return len; ++} ++ ++static struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++ ++/* ++ * bl_init_proc -- set up proc interfaces ++ * ++ * Creating a pnfs_block directory isn't 
really required at this point ++ * since we've only got a single node in that directory. If the need for ++ * more nodes doesn't present itself shortly this code should revert ++ * to a single top level node. McNeal 11-Aug-2008. ++ */ ++int ++bl_init_proc(void) ++{ ++ struct proc_dir_entry *e; ++ ++ e = proc_mkdir("fs/pnfs_block", NULL); ++ if (!e) ++ return -ENOMEM; ++ ++ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL); ++ if (!e) ++ return -ENOMEM; ++ e->proc_fops = &ctl_ops; ++ ++ return 0; ++} ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.38.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.38.noarch/fs/nfsd/bl_ops.c +--- linux-2.6.38.noarch/fs/nfsd/bl_ops.c.orig 2011-03-26 07:57:44.281821252 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/bl_ops.c 2011-03-26 07:57:44.281821252 -0400 +@@ -0,0 +1,1672 @@ ++/* ++ * bl_ops.c ++ * spNFS ++ * ++ * Created by Rick McNeal on 4/1/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. ++ * ++ */ ++ ++/* ++ * Block layout operations. ++ * ++ * These functions, with the exception of pnfs_block_enabled, are assigned to ++ * the super block s_export_op structure. ++ */ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) ++ ++#define BL_LAYOUT_HASH_BITS 4 ++#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS) ++#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1) ++#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256) ++ ++#define bl_layout_hashval(id) \ ++ ((id) & BL_LAYOUT_HASH_MASK) ++ ++#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len) ++#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len) ++#define _2SECTS(v) ((v) >> 9) ++ ++#ifndef READ32 ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++(x) = (u64)ntohl(*p++) << 32; \ ++(x) |= ntohl(*p++); \ ++} while (0) ++#endif ++ ++ ++typedef enum {True, False} boolean_t; ++/* ---- block layoutget and commit structure ---- */ ++typedef struct bl_layout_rec { ++ struct list_head blr_hash, ++ blr_layouts; ++ dev_t blr_rdev; ++ struct inode *blr_inode; ++ int blr_recalled; // debug ++ u64 blr_orig_size, ++ blr_commit_size, ++ blr_ext_size; ++ spinlock_t blr_lock; // Protects blr_layouts ++} bl_layout_rec_t; ++ ++static struct list_head layout_hash; ++static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE]; ++static spinlock_t layout_hashtbl_lock; ++ ++/* ---- prototypes ---- */ ++static boolean_t device_slice(dev_t devid); ++static boolean_t device_dm(dev_t devid); ++static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **); ++static bl_layout_rec_t *layout_inode_find(struct inode *i); ++static void layout_inode_del(struct inode *i); ++static char *map_state2name(enum pnfs_block_extent_state4 s); ++static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type); ++static void bld_free(pnfs_blocklayout_devinfo_t *bld); ++static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head *volumes, ++ dev_t devid, int local_index); ++static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes, ++ dev_t devid, int my_loc, int idx); ++static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg); ++struct list_head 
*layout_cache_iter(bl_layout_rec_t *r, ++ struct list_head *bl_possible, struct nfsd4_layout_seg *seg); ++static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h); ++static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h); ++static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg); ++static void print_bll(pnfs_blocklayout_layout_t *b, char *); ++static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r, ++ struct list_head *h, struct nfsd4_layout_seg *seg); ++static inline void bll_collapse(bl_layout_rec_t *r, ++ pnfs_blocklayout_layout_t *c); ++static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len, ++ enum bl_cache_state state, struct list_head *h); ++static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b, ++ enum bl_cache_state c, struct list_head *h); ++static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s); ++static void extents_setup(struct fiemap_extent_info *fei); ++static void extents_count(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_process(struct fiemap_extent_info *fei, ++ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev, ++ pnfs_blocklayout_layout_t *b); ++static void extents_cleanup(struct fiemap_extent_info *fei); ++ ++void ++nfsd_bl_init(void) ++{ ++ int i; ++ dprintk("%s loaded\n", __func__); ++ ++ spin_lock_init(&layout_hashtbl_lock); ++ INIT_LIST_HEAD(&layout_hash); ++ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&layout_hashtbl[i]); ++ bl_init_proc(); ++} ++ ++/* ++ * pnfs_block_enabled -- check to see if this file system should be export as ++ * block pnfs ++ */ ++int ++pnfs_block_enabled(struct inode *inode, int ex_flags) ++{ ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ static 
int bl_comm_once = 0; ++ ++ dprintk("--> %s\n", __func__); ++ /* ++ * FIXME: Figure out method to determine if this file system should ++ * be exported. The following areas need to be checked. ++ * (1) Validate that this file system was exported as a pNFS ++ * block-layout ++ * (2) Has there been successful communication with the ++ * volume daemon? ++ */ ++ /* Check #1 */ ++#ifdef notyet ++ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) { ++ dprintk("%s: pnfs_block not set in export\n", __func__); ++ return 0; ++ } ++#endif ++ ++ /* Check #1 */ ++ if (!bl_comm_once) { ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ if (msg.u.msg_vers != res->u.vers) { ++ dprintk("%s: vers mismatch, kernel != daemon\n", ++ __func__); ++ kfree(res); ++ return 0; ++ } ++ } ++ bl_comm_once = 1; ++ ++ kfree(res); ++ ++ dprintk("<-- %s okay\n", __func__); ++ return 1; ++} ++ ++int ++bl_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_BLOCK_VOLUME; ++} ++ ++int ++bl_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_devid = sb->s_dev; ++ res->gd_verf = 1; ++ res->gd_cookie = 1; ++ return 0; ++} ++ ++static int ++bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld_slice_p, ++ *bld_simple_p, ++ *bld; ++ int status = -EIO, ++ location = 0; ++ struct list_head volumes; ++ ++ dprintk("--> %s\n", __func__); ++ INIT_LIST_HEAD(&volumes); ++ ++ bld_simple_p = bld_simple(&volumes, devid->devid, ++ location++); ++ if (!bld_simple_p) ++ goto out; ++ bld_slice_p = bld_slice(&volumes, devid->devid, location++, ++ bld_simple_p->bld_index_loc); ++ ++ if (!bld_slice_p) ++ goto out; ++ ++ status = 
blocklayout_encode_devinfo(xdr, &volumes);
++
++out:
++	while (!list_empty(&volumes)) {
++		bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
++		    bld_list);
++		if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE)
++			kfree(bld->u.simple.bld_sig);
++		bld_free(bld);
++	}
++
++	dprintk("<-- %s (rval %d)\n", __func__, status);
++	return status;
++}
++
++/*
++ * bl_getdeviceinfo_dm -- encode the device tree of a Device Mapper volume
++ *
++ * Asks the daemon (PNFS_UPCALL_MSG_DMGET) for the stripe geometry, builds a
++ * simple + slice volume pair for each component device and finally appends
++ * the stripe volume, which refers to the slices by their index.
++ *
++ * Returns the value of blocklayout_encode_devinfo() on success, otherwise
++ * -EIO (failed upcall, failed allocation or failed component lookup).
++ */
++static int
++bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr,
++		    const struct nfsd4_pnfs_deviceid *devid)
++{
++	pnfs_blocklayout_devinfo_t	*bld,
++					*stripe		= NULL;
++	int				status		= -EIO,	// default to error
++					i,
++					location	= 0;
++	struct list_head		volumes;
++	bl_comm_msg_t			msg;
++	/* Must start out NULL: 'out' does kfree(res) even if the upcall failed */
++	bl_comm_res_t			*res		= NULL;
++
++	dprintk("--> %s\n", __func__);
++	INIT_LIST_HEAD(&volumes);
++
++	msg.msg_type = PNFS_UPCALL_MSG_DMGET;
++	msg.u.msg_dev = devid->devid;
++	if (bl_upcall(bl_comm_global, &msg, &res)) {
++		dprintk("%s: upcall for DMGET failed\n", __func__);
++		goto out;
++	}
++
++	/*
++	 * Don't use bld_alloc() here. If used this will be the first volume
++	 * type added to the list whereas the protocol requires it to be the
++	 * last. Until it is linked into 'volumes' the stripe descriptor is
++	 * owned by 'stripe' and released separately at 'out'.
++	 */
++	stripe = kzalloc(sizeof (*stripe), GFP_KERNEL);
++	if (!stripe)
++		goto out;
++	stripe->bld_type = PNFS_BLOCK_VOLUME_STRIPE;
++	stripe->u.stripe.bld_stripes = res->u.stripe.num_stripes;
++	stripe->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL;
++	dprintk("%s: stripes %d, chunk_size %Lu\n", __func__,
++	    stripe->u.stripe.bld_stripes,
++	    stripe->u.stripe.bld_chunk_size / 512LL);
++
++	/* kcalloc() rejects a count whose byte size would overflow */
++	stripe->u.stripe.bld_stripe_indexs =
++	    kcalloc(stripe->u.stripe.bld_stripes, sizeof (int), GFP_KERNEL);
++	if (!stripe->u.stripe.bld_stripe_indexs)
++		goto out;
++
++	for (i = 0; i < stripe->u.stripe.bld_stripes; i++) {
++		dev_t dev;
++		pnfs_blocklayout_devinfo_t *bldp;
++
++		dev = MKDEV(res->u.stripe.devs[i].major,
++		    res->u.stripe.devs[i].minor);
++		if (dev == 0)
++			goto out;
++
++		bldp = bld_simple(&volumes, dev, location++);
++		if (!bldp) {
++			dprintk("%s: bld_simple failed\n", __func__);
++			goto out;
++		}
++		bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc);
++
++		if (!bldp) {
++			dprintk("%s: bld_slice failed\n", __func__);
++			goto out;
++		}
++		stripe->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc;
++
++	}
++	list_add_tail(&stripe->bld_list, &volumes);
++	stripe = NULL;		/* now owned by 'volumes' */
++	status = blocklayout_encode_devinfo(xdr, &volumes);
++
++out:
++	if (stripe) {
++		/* Failed before the stripe volume made it onto the list */
++		kfree(stripe->u.stripe.bld_stripe_indexs);
++		kfree(stripe);
++	}
++	while (!list_empty(&volumes)) {
++		bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
++		    bld_list);
++		switch (bld->bld_type) {
++		case PNFS_BLOCK_VOLUME_SLICE:
++		case PNFS_BLOCK_VOLUME_CONCAT:
++			// No memory to release for these
++			break;
++		case PNFS_BLOCK_VOLUME_SIMPLE:
++			kfree(bld->u.simple.bld_sig);
++			break;
++		case PNFS_BLOCK_VOLUME_STRIPE:
++			kfree(bld->u.stripe.bld_stripe_indexs);
++			break;
++		}
++		bld_free(bld);
++	}
++	kfree(res);
++	dprintk("<-- %s (rval %d)\n", __func__, status);
++	return status;
++}
++
++/*
++ * bl_getdeviceinfo -- determine device tree for requested devid
++ */
++int
++bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
++		 u32 layout_type,
++		 const struct 
nfsd4_pnfs_deviceid *devid)
++{
++	if (device_slice(devid->devid) == True)
++		return bl_getdeviceinfo_slice(sb, xdr, devid);
++	else if (device_dm(devid->devid) == True)
++		return bl_getdeviceinfo_dm(sb, xdr, devid);
++	return -EINVAL;
++}
++
++/*
++ * bl_layoutget -- hand out a block layout for the requested byte range
++ *
++ * The range in res->lg_seg is trimmed/rounded to 512-byte sectors, storage
++ * is preallocated for non-read requests, and the per-inode layout cache is
++ * consulted and updated before the resulting extents are XDR encoded.
++ *
++ * Returns NFS4_OK or an NFS4ERR_* code; on error res->lg_seg offset and
++ * length are reset to 0.
++ */
++enum nfsstat4
++bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr,
++	     const struct nfsd4_pnfs_layoutget_arg *arg,
++	     struct nfsd4_pnfs_layoutget_res *res)
++{
++	pnfs_blocklayout_layout_t	*b;
++	bl_layout_rec_t			*r;
++	struct list_head		bl_possible,
++					*bl_candidates	= NULL;
++	boolean_t			del_on_error	= False;
++	int				adj;
++	enum nfsstat4			nfserr		= NFS4_OK;
++
++	dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n",
++	    __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset),
++	    _2SECTS(res->lg_seg.length), res->lg_seg.iomode);
++
++	if (res->lg_seg.length == 0) {
++		printk("%s: request length of 0, error condition\n", __func__);
++		return NFS4ERR_BADLAYOUT;
++	}
++
++	/*
++	 * Adjust the length as required per spec.
++	 * - First case is where the length is set to (u64)-1. Cheap means to
++	 *   define the end of the file.
++	 * - Second case is where the I/O mode is read-only, but the request is
++	 *   past the end of the file so the request needs to be trimmed.
++	 *
++	 * NOTE(review): if lg_seg.offset is beyond i_size this subtraction
++	 * wraps around; confirm what the protocol expects in that case.
++	 */
++	if ((res->lg_seg.length == NFS4_MAX_UINT64) ||
++	    (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) &&
++	    (res->lg_seg.iomode == IOMODE_READ)))
++		res->lg_seg.length = i->i_size - res->lg_seg.offset;
++
++	/* Round the range outwards to whole 512-byte sectors */
++	adj = res->lg_seg.offset & 511;
++	res->lg_seg.offset -= adj;
++	res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511;
++
++	if (res->lg_seg.iomode != IOMODE_READ) {
++		/*
++		 * layout_inode_add() only verifies the method further down,
++		 * so make sure it exists before calling through it.
++		 */
++		if (!i->i_fop->fallocate)
++			return NFS4ERR_IO;
++		if (i->i_fop->fallocate(i, FALLOC_FL_KEEP_SIZE,
++		    res->lg_seg.offset, res->lg_seg.length))
++			return NFS4ERR_IO;
++	}
++
++	INIT_LIST_HEAD(&bl_possible);
++
++	if ((r = layout_inode_find(i)) == NULL) {
++		if (layout_inode_add(i, &r) == False) {
++			printk("%s: layout_inode_add failed\n", __func__);
++			return NFS4ERR_IO;
++		}
++		del_on_error = True;
++	}
++	BUG_ON(!r);
++
++	spin_lock(&r->blr_lock);
++
++	if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) {
++		/*
++		 * This will send LAYOUTTRYAGAIN error to the client.
++		 */
++		dprintk("%s: layout_cache_fill_from() failed\n", __func__);
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++	} else {
++		res->lg_return_on_close = 1;
++		res->lg_seg.length = 0;
++
++		bl_candidates = layout_cache_iter(r, &bl_possible,
++		    &res->lg_seg);
++		if (!bl_candidates)
++			nfserr = NFS4ERR_LAYOUTTRYLATER;
++	}
++
++	if (nfserr) {
++		/*
++		 * Nothing past this point looks at bl_possible, so release
++		 * whatever is still queued on it before bailing out.
++		 */
++		while (!list_empty(&bl_possible)) {
++			b = list_entry(bl_possible.next,
++			    struct pnfs_blocklayout_layout, bll_list);
++			list_del(&b->bll_list);
++			kfree(b);
++		}
++		goto layoutget_cleanup;
++	}
++
++	layout_cache_merge(r, bl_candidates);
++	if (layout_cache_update(r, bl_candidates)) {
++		/* ---- Failed to allocate memory. 
---- */
++		dprintk("%s: layout_cache_update() failed\n", __func__);
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++
++	nfserr = blocklayout_encode_layout(xdr, bl_candidates);
++	if (nfserr)
++		dprintk("%s: layoutget xdr routine failed\n", __func__);
++
++layoutget_cleanup:
++	if (bl_candidates) {
++		while (!list_empty(bl_candidates)) {
++			b = list_entry(bl_candidates->next,
++			    struct pnfs_blocklayout_layout, bll_list);
++			list_del(&b->bll_list);
++			kfree(b);
++		}
++		/* The list head itself was kmalloc'ed by layout_cache_iter() */
++		kfree(bl_candidates);
++	}
++
++	spin_unlock(&r->blr_lock);
++	if (unlikely(nfserr)) {
++		if (del_on_error == True)
++			layout_inode_del(i);
++		res->lg_seg.length = 0;
++		res->lg_seg.offset = 0;
++	}
++
++	dprintk("<-- %s (rval %u)\n", __func__, nfserr);
++	return nfserr;
++}
++
++/*
++ * bl_layoutcommit_extents -- decode the extent list sent with LAYOUTCOMMIT
++ *
++ * The extents are only decoded and traced; nothing is done with them.
++ * The extent count comes from the client, so it is checked against the
++ * size of the supplied buffer before it is used to size the scratch array
++ * or to drive the decode loop.
++ * NOTE(review): lc_up_len is treated as a byte count -- confirm.
++ *
++ * Returns 0, -EINVAL for a malformed extent list or -ENOMEM.
++ */
++static int
++bl_layoutcommit_extents(const struct nfsd4_pnfs_layoutcommit_arg *args)
++{
++	/* One encoded extent: five 64-bit fields followed by a 32-bit state */
++	const u64			ext_xdr_len = 5 * sizeof (u64) +
++					    sizeof (u32);
++	u64				avail = args->lc_up_len;
++	struct pnfs_blocklayout_layout	*b;
++	__be32				*p = args->lc_up_layout;
++	int				extents,
++					idx;
++
++	if (avail < sizeof (u32))
++		return -EINVAL;
++
++	/*
++	 * Client is returning a set of extents which
++	 * should/could be used to update the file system.
++	 * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08
++	 */
++	READ32(extents);
++	dprintk(" Client returning %d extents: data size %d\n",
++	    extents, args->lc_up_len);
++
++	avail -= sizeof (u32);
++	if (extents < 0 || (u64)extents * ext_xdr_len > avail)
++		return -EINVAL;
++
++	b = kcalloc(extents, sizeof (*b), GFP_KERNEL);
++	if (!b)
++		return -ENOMEM;
++
++	for (idx = 0; idx < extents; idx++) {
++		READ64(b[idx].bll_vol_id.sbid);
++		READ64(b[idx].bll_vol_id.devid);
++		READ64(b[idx].bll_foff);
++		READ64(b[idx].bll_len);
++		READ64(b[idx].bll_soff);
++		READ32(b[idx].bll_es);
++		dprintk(" %d: foff %Lu, len %Lu, soff %Lu "
++		    "state %s\n",
++		    idx, _2SECTS(b[idx].bll_foff),
++		    _2SECTS(b[idx].bll_len),
++		    _2SECTS(b[idx].bll_soff),
++		    map_state2name(b[idx].bll_es));
++	}
++	kfree(b);
++	return 0;
++}
++
++/*
++ * bl_layoutcommit -- commit changes, especially size, to file system
++ *
++ * Currently this routine isn't called and everything is handled within
++ * nfsd4_layoutcommit(). By not calling this routine the server doesn't
++ * handle a partial return, a set of extents, of the layout. The extents
++ * are decoded here, but nothing is done with them. If this routine is
++ * to be called the interface must change to pass the 'dentry' pointer such
++ * that notify_change() can be called.
++ */
++int
++bl_layoutcommit(struct inode *i,
++		const struct nfsd4_pnfs_layoutcommit_arg *args,
++		struct nfsd4_pnfs_layoutcommit_res *res)
++{
++	bl_layout_rec_t			*r;
++	int				status	= 0;
++	u64				lw_plus;
++
++	dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
++	r = layout_inode_find(i);
++	if (r) {
++		/* Offset of the last byte written, plus one, is the new size */
++		lw_plus = args->lc_last_wr + 1;
++		if (args->lc_newoffset) {
++			dprintk(" lc_last_wr %Lu\n", lw_plus);
++			if (r->blr_orig_size < lw_plus) {
++				r->blr_orig_size	= lw_plus;
++				res->lc_size_chg	= 1;
++				res->lc_newsize		= lw_plus;
++			}
++		}
++
++		if (args->lc_up_len)
++			status = bl_layoutcommit_extents(args);
++	} else
++		dprintk("%s: Unexpected commit to inode %p\n", __func__, i);
++
++	dprintk("<-- %s (rval %d)\n", __func__, status);
++	return status;
++}
++
++int
++bl_layoutreturn(struct inode *i,
++		const struct nfsd4_pnfs_layoutreturn_arg *args)
++{
++	int				status	= 0;
++	bl_layout_rec_t			*r;
++
++	dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
++
++	r = layout_inode_find(i);
++	if (r) {
++		spin_lock(&r->blr_lock);
++		layout_cache_del(r, &args->lr_seg);
++		
spin_unlock(&r->blr_lock); ++ dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n", ++ r->blr_ext_size, i->i_size, r->blr_orig_size); ++ } ++ ++ layout_inode_del(i); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ bl_layout_rec_t *r; ++ pnfs_blocklayout_layout_t *b; ++ u64 adj; ++ ++ dprintk("--> %s\n", __func__); ++ BUG_ON(!len); ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n", ++ inode->i_sb->s_dev, inode->i_ino, ++ _2SECTS(offset), _2SECTS(len)); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* ++ * XXX figure out how to get a sb since there's no ++ * inode ptr ++ */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++restart: ++ r = layout_inode_find(inode); ++ if (r && len && !r->blr_recalled) { ++ spin_lock(&r->blr_lock); ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (!r->blr_recalled && !b->bll_recalled && ++ (offset >= b->bll_foff) && (offset < BLL_F_END(b))) { ++ b->bll_recalled = 1; ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = 0; ++ lr.cbl_seg.length = NFS4_MAX_UINT64; ++ r->blr_recalled = 1; ++ dprintk(" FULL LAYOUTRECALL\n"); ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ ++ /* ++ * Currently there are only two cases where the ++ * layout is being returned. ++ * (1) Someone is issuing a NFS_WRITE operation ++ * to this layout. ++ * (2) The file has been truncated which means ++ * the layout is immediately made invalid. ++ * In both cases the client must write any ++ * uncommitted modifications to the server via ++ * NFS_WRITE. 
++ */ ++ lr.cbl_layoutchanged = 1; ++ ++ /* ++ * Need to drop the lock because we'll get a ++ * layoutreturn which will block waiting for ++ * the lock. The request will come in on the ++ * same thread which will cause a deadlock. ++ */ ++ spin_unlock(&r->blr_lock); ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ adj = MIN(b->bll_len - (offset - b->bll_foff), ++ len); ++ offset += adj; ++ len -= adj; ++ if (!len) { ++ spin_lock(&r->blr_lock); ++ break; ++ } ++ /* ++ * Since layoutreturn will have been called we ++ * can't assume blr_layouts is still valid, ++ * so restart. ++ */ ++ goto restart; ++ } ++ } ++ spin_unlock(&r->blr_lock); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++/* ++ * []------------------------------------------------------------------[] ++ * | Support functions from here on down. | ++ * []------------------------------------------------------------------[] ++ */ ++ ++/* ++ * bld_simple -- given a dev_t build a simple volume structure ++ * ++ * Simple volume contains the device signature and offset to that data in ++ * the storage volume. 
++ */
++static pnfs_blocklayout_devinfo_t *
++bld_simple(struct list_head *volumes, dev_t devid, int local_index)
++{
++	pnfs_blocklayout_devinfo_t	*bld	= NULL;
++	bl_comm_msg_t			msg;
++	bl_comm_res_t			*res	= NULL;
++
++	/* The daemon supplies the signature and where it lives on the device */
++	msg.msg_type = PNFS_UPCALL_MSG_GETSIG;
++	msg.u.msg_dev = devid;
++	if (bl_upcall(bl_comm_global, &msg, &res)) {
++		dprintk("%s: Failed to get signature information\n", __func__);
++		goto error;
++	}
++
++	bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE);
++	if (!bld)
++		goto error;	/* 'res' still has to be released */
++
++	/*
++	 * Callers read bld_index_loc back and hand it to bld_slice() as the
++	 * index of the simple volume, so record our position in the list.
++	 */
++	bld->bld_index_loc = local_index;
++	bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset;
++	bld->u.simple.bld_sig_len = res->u.sig.len;
++	bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL);
++	if (!bld->u.simple.bld_sig)
++		goto error;
++
++	memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len);
++	kfree(res);
++	return bld;
++
++error:
++	if (bld)
++		bld_free(bld);
++	kfree(res);
++	dprintk("%s: error in bld_simple\n", __func__);
++	return NULL;
++}
++
++/*
++ * bld_slice -- given a dev_t build a slice volume structure
++ *
++ * A slice volume contains the length of the slice/partition and its offset
++ * from the beginning of the storage volume. There's also a reference to
++ * the "simple" volume which contains this slice. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE); ++ if (!bld) ++ return NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSLICE; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Upcall to get slice info failed\n"); ++ bld_free(bld); ++ return NULL; ++ } ++ ++ bld->bld_devid.devid = devid; ++ bld->bld_index_loc = my_loc; ++ bld->u.slice.bld_start = res->u.slice.start * 512LL; ++ bld->u.slice.bld_len = res->u.slice.length * 512LL; ++ bld->u.slice.bld_index = simple_loc; ++ ++ dprintk("%s: start %Lu, len %Lu\n", __func__, ++ bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL); ++ ++ kfree(res); ++ dprintk("<-- %s (rval %p)\n", __func__, bld); ++ return bld; ++} ++ ++static int ++layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!list_empty(&r->blr_layouts)) ++ if (layout_cache_fill_from_list(r, h, seg) == False) ++ return -EIO; ++ ++ /* ++ * This deals with two conditions. ++ * (1) When blr_layouts is empty we need to create the first entry ++ * (2) When the range requested falls past the end of any current ++ * layout the residual must be taken care of. 
++	 */
++	if (seg->length) {
++		n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h);
++		if (!n)
++			return -ENOMEM;
++		dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff),
++		    _2SECTS(n->bll_len));
++	}
++
++	dprintk("<-- %s\n", __func__);
++	return 0;
++}
++
++/*
++ * layout_iter_drain -- unlink and free every layout entry queued on 'h'
++ */
++static void
++layout_iter_drain(struct list_head *h)
++{
++	pnfs_blocklayout_layout_t	*b;
++
++	while (!list_empty(h)) {
++		b = list_entry(h->next, struct pnfs_blocklayout_layout,
++		    bll_list);
++		list_del(&b->bll_list);
++		kfree(b);
++	}
++}
++
++/*
++ * layout_cache_iter -- turn the "possible" ranges into candidate extents
++ *
++ * Entries in bl_possible marked BLOCK_LAYOUT_NEW are resolved through the
++ * file system's fiemap method (or become a hole extent for read requests);
++ * all other entries are duplicated as they are. seg->length accumulates
++ * the length of the candidates and seg->offset is set to the first one.
++ *
++ * Returns a kmalloc'ed list head holding the candidates, which the caller
++ * must release, or NULL on failure. bl_possible is emptied in both cases.
++ */
++struct list_head *
++layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible,
++    struct nfsd4_layout_seg *seg)
++{
++	pnfs_blocklayout_layout_t	*b,
++					*n		= NULL;
++	struct list_head		*bl_candidates	= NULL;
++	struct fiemap_extent_info	fei;
++	struct inode			*i;
++	dev_t				dev;
++
++	dev = r->blr_rdev;
++	i = r->blr_inode;
++
++	dprintk("--> %s\n", __func__);
++	extents_setup(&fei);
++	bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL);
++	if (!bl_candidates)
++		goto cleanup;
++	INIT_LIST_HEAD(bl_candidates);
++
++	list_for_each_entry(b, bl_possible, bll_list) {
++		if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
++
++			extents_count(&fei, i, b->bll_foff, b->bll_len);
++			if (fei.fi_extents_mapped) {
++
++				/*
++				 * Common case here. Got a range which has
++				 * extents. Now get those extents and process
++				 * them into pNFS extents.
++				 */
++				if (extents_get(&fei, i, b->bll_foff,
++				    b->bll_len) == False)
++					goto cleanup;
++				if (extents_process(&fei, bl_candidates,
++				    seg, dev, b) == False)
++					goto cleanup;
++				extents_cleanup(&fei);
++
++			} else if (seg->iomode == IOMODE_READ) {
++
++				/*
++				 * Found a hole in a file while reading. No
++				 * problem, just create a pNFS extent for the
++				 * range and let the client know there's no
++				 * backing store.
++				 */
++				n = bll_alloc(b->bll_foff, b->bll_len,
++				    BLOCK_LAYOUT_NEW, bl_candidates);
++				if (!n)
++					goto cleanup;
++				n->bll_es = PNFS_BLOCK_NONE_DATA;
++				n->bll_vol_id.sbid = 0;
++				n->bll_vol_id.devid = dev;
++				seg->length += b->bll_len;
++			} else {
++
++				/*
++				 * There's a problem here. Since the iomode
++				 * is read/write fallocate should have allocated
++				 * any necessary storage for the given range.
++				 */
++				dprintk(" Extent count for RW is 0\n");
++				goto cleanup;
++			}
++
++		} else {
++			n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates);
++			if (!n)
++				goto cleanup;
++			seg->length += n->bll_len;
++		}
++
++		if (r->blr_ext_size < (b->bll_foff + b->bll_len))
++			r->blr_ext_size = b->bll_foff + b->bll_len;
++	}
++
++	/* Without a candidate there is no first entry to take the offset from */
++	if (list_empty(bl_candidates))
++		goto cleanup;
++
++	layout_iter_drain(bl_possible);
++
++	b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout,
++	    bll_list);
++	seg->offset = b->bll_foff;
++	dprintk("<-- %s okay\n", __func__);
++	return bl_candidates;
++
++cleanup:
++	extents_cleanup(&fei);
++	layout_iter_drain(bl_possible);
++	if (bl_candidates) {
++		/* Free the entries gathered so far, not just the list head */
++		layout_iter_drain(bl_candidates);
++		kfree(bl_candidates);
++	}
++	dprintk("<-- %s, error occurred\n", __func__);
++	return NULL;
++}
++
++/*
++ * layout_cache_merge -- collapse layouts which make up a contiguous range.
++ */
++static void
++layout_cache_merge(bl_layout_rec_t *r, struct list_head *h)
++{
++	pnfs_blocklayout_layout_t	*b,
++					*p;
++
++	dprintk("--> %s\n", __func__);
++restart:
++	p = NULL;
++	list_for_each_entry(b, h, bll_list) {
++		if (p && (BLL_S_END(p) == b->bll_soff) &&
++		    (p->bll_es == b->bll_es) &&
++		    (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
++			/*
++			 * We've got a candidate. 
++ */ ++#ifdef too_verbose ++ dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), ++ _2SECTS(p->bll_foff), _2SECTS(p->bll_len), ++ _2SECTS(b->bll_soff)); ++#endif ++ ++ if (p->bll_cache_state == BLOCK_LAYOUT_CACHE) ++ p->bll_cache_state = BLOCK_LAYOUT_UPDATE; ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else if (p && (BLL_F_END(p) == b->bll_foff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es == PNFS_BLOCK_NONE_DATA)) { ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else ++ p = b; ++ } ++ dprintk("<-- %s\n", __func__); ++} ++ ++static int ++layout_cache_update(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *c, ++ *n; ++ boolean_t status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ if (list_empty(&r->blr_layouts)) { ++ /* ---- Just add entries and return ---- */ ++ dprintk(" cache empty for inode 0x%x:%ld\n", r->blr_rdev, ++ r->blr_inode->i_ino); ++ list_for_each_entry(b, h, bll_list) { ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, ++ &r->blr_layouts); ++ if (!c) { ++ status = -ENOMEM; ++ break; ++ } ++ dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), c->bll_es); ++ } ++ return status; ++ } ++ ++ list_for_each_entry(b, h, bll_list) { ++ BUG_ON(!b->bll_vol_id.devid); ++ if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) { ++ boolean_t found = False; ++ list_for_each_entry(c, &r->blr_layouts, bll_list) { ++ if ((b->bll_soff >= c->bll_soff) && ++ (b->bll_soff < BLL_S_END(c)) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ u64 u; ++ ++ if ((b->bll_foff < c->bll_foff) || ++ (b->bll_foff > BLL_F_END(c))) ++ BUG(); ++ ++ u = BLL_S_END(b) - BLL_S_END(c); ++ /* ++ * The updated cache entry has to be ++ * different than the current. ++ * Otherwise the cache state for 'b' ++ * should be BLOCK_LAYOUT_CACHE. 
++ */ ++ BUG_ON(BLL_S_END(b) < BLL_S_END(c)); ++ ++ dprintk(" " ++ "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n", ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), ++ _2SECTS(c->bll_len + u)); ++ c->bll_len += u; ++ bll_collapse(r, c); ++ found = True; ++ break; ++ } ++ } ++ ++ if (found == False) { ++ dprintk(" ERROR Expected to find" ++ " %Lu(f):%Lu(l):%Lu(s), but didn't\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_for_each_entry(c, &r->blr_layouts, bll_list) ++ print_bll(c, "Cached"); ++ BUG(); ++ } ++ } else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ c = list_first_entry(&r->blr_layouts, ++ struct pnfs_blocklayout_layout, bll_list); ++ if (b->bll_foff < c->bll_foff) { ++ /* ++ * Special case where new entry is before ++ * first cached entry. ++ */ ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL); ++ list_add(&c->bll_list, &r->blr_layouts); ++ dprintk(" new entry at head of list at %Lu, " ++ "len %Lu\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len)); ++ } else { ++ list_for_each_entry(c, &r->blr_layouts, ++ bll_list) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, ++ bll_list); ++ /* ++ * This is ugly, but can't think of ++ * another way to examine this case. ++ * Consider the following. Need to ++ * add an entry which starts at 40 ++ * and the cache has the following ++ * entries: ++ * Start Length ++ * 10 5 ++ * 30 5 ++ * 50 5 ++ * So, need to look and see if the new ++ * entry starts after the current ++ * cache, but before the next one. ++ * There's a catch in that the next ++ * entry might not be valid as it's ++ * really just a pointer to the list ++ * head. 
++ */ ++ if (((b->bll_foff >= ++ BLL_F_END(c)) && ++ (c->bll_list.next == &r->blr_layouts)) || ++ ((b->bll_foff >= ++ BLL_F_END(c)) && ++ (b->bll_foff < n->bll_foff))) { ++ ++ n = bll_alloc_dup(b, ++ BLOCK_LAYOUT_CACHE, NULL); ++ dprintk(" adding new %Lu:%Lu" ++ " after %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len), ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len)); ++ list_add(&n->bll_list, ++ &c->bll_list); ++ break; ++ } ++ } ++ } ++ } ++ } ++ dprintk("<-- %s\n", __func__); ++ return status; ++} ++ ++static void ++layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in) ++{ ++ struct pnfs_blocklayout_layout *b, ++ *n; ++ u64 len; ++ struct nfsd4_layout_seg seg = *seg_in; ++ ++ dprintk("--> %s\n", __func__); ++ if (seg.length == NFS4_MAX_UINT64) { ++ r->blr_recalled = 0; ++ dprintk(" Fast return of all layouts\n"); ++ while (!list_empty(&r->blr_layouts)) { ++ b = list_entry(r->blr_layouts.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ dprintk(" foff %Lu, len %Lu, soff %Lu\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ dprintk("<-- %s\n", __func__); ++ return; ++ } ++ ++restart: ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg.offset == b->bll_foff) { ++ /* ++ * This handle the following three cases: ++ * (1) return layout matches entire cache layout ++ * (2) return layout matches beginning portion of cache ++ * (3) return layout matches entire cache layout and ++ * into next entry. Varies from #1 in end case. 
++ */ ++ dprintk(" match on offsets, %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length)); ++ len = MIN(seg.length, b->bll_len); ++ b->bll_foff += len; ++ b->bll_soff += len; ++ b->bll_len -= len; ++ seg.length -= len; ++ seg.offset += len; ++ if (!b->bll_len) { ++ list_del(&b->bll_list); ++ kfree(b); ++ dprintk(" removing cache line\n"); ++ if (!seg.length) { ++ dprintk(" also finished\n"); ++ goto complete; ++ } ++ /* ++ * Since 'b' was freed we can't continue at the ++ * next entry which is referenced as ++ * b->bll_list.next by the list_for_each_entry ++ * macro. Need to restart the loop. ++ * TODO: Think about creating a dummy 'b' which ++ * would keep list_for_each_entry() happy. ++ */ ++ goto restart; ++ } ++ if (!seg.length) { ++ dprintk(" finished, but cache line not" ++ "empty\n"); ++ goto complete; ++ } ++ } else if ((seg.offset >= b->bll_foff) && ++ (seg.offset < BLL_F_END(b))) { ++ /* ++ * layout being returned is within this cache line. ++ */ ++ dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ BUG_ON(!seg.length); ++ if ((seg.offset + seg.length) >= BLL_F_END(b)) { ++ /* ++ * Layout returned starts in the middle of ++ * cache entry and just need to trim back ++ * cache to shorter length. ++ */ ++ dprintk(" trim back cache line\n"); ++ len = seg.offset - b->bll_foff; ++ seg.offset += b->bll_len - len; ++ seg.length -= b->bll_len - len; ++ b->bll_len = len; ++ if (!seg.length) ++ return; ++ } else { ++ /* ++ * Need to split current cache layout because ++ * chunk is being removed from the middle. 
++				 */
++				dprintk(" split cache line\n");
++				len = seg.offset + seg.length;
++				n = bll_alloc(len,
++				    (b->bll_foff + b->bll_len) - len,
++				    BLOCK_LAYOUT_CACHE, NULL);
++				if (!n) {
++					/*
++					 * Can't describe the tail; leave the
++					 * cache line as it is.
++					 */
++					return;
++				}
++				/*
++				 * 'len' is an absolute file offset, so the
++				 * tail starts (len - bll_foff) bytes into the
++				 * storage range; it keeps the state and volume
++				 * of the line it was split from.
++				 */
++				n->bll_soff = b->bll_soff + (len - b->bll_foff);
++				n->bll_es = b->bll_es;
++				n->bll_vol_id.sbid = b->bll_vol_id.sbid;
++				n->bll_vol_id.devid = b->bll_vol_id.devid;
++				list_add(&n->bll_list, &b->bll_list);
++				b->bll_len = seg.offset - b->bll_foff;
++				return;
++			}
++		}
++	}
++complete:
++	if (list_empty(&r->blr_layouts))
++		r->blr_recalled = 0;
++	dprintk("<-- %s\n", __func__);
++}
++
++/*
++ * layout_cache_fill_from_list -- fills from cache list
++ *
++ * Walks the cached layouts of 'r' and queues on 'h' one entry for every
++ * part of 'seg' that lies in front of a cached layout (BLOCK_LAYOUT_NEW)
++ * or inside one (BLOCK_LAYOUT_CACHE). seg->offset and seg->length are
++ * advanced past what has been queued; the caller handles any residual.
++ *
++ * Returns False on allocation failure or when layout_conflict() reports
++ * False for a cached layout, True otherwise.
++ *
++ * NOTE: This routine was only separated out from layout_cache_fill_from()
++ * to reduce the indentation level which makes the code easier to read.
++ */
++static inline boolean_t
++layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h,
++    struct nfsd4_layout_seg *seg)
++{
++	pnfs_blocklayout_layout_t	*b,
++					*n;
++	enum pnfs_block_extent_state4	s;
++
++	list_for_each_entry(b, &r->blr_layouts, bll_list) {
++		if (seg->offset < b->bll_foff) {
++			n = bll_alloc(seg->offset,
++			    MIN(seg->length, b->bll_foff - seg->offset),
++			    BLOCK_LAYOUT_NEW, NULL);
++			if (!n)
++				return False;
++
++			list_add_tail(&n->bll_list, h);
++			dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n",
++			    _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
++			    _2SECTS(b->bll_foff), _2SECTS(b->bll_len));
++			seg->offset += n->bll_len;
++			seg->length -= n->bll_len;
++			if (!seg->length)
++				break;
++		}
++
++		if ((seg->offset >= b->bll_foff) &&
++		    (seg->offset < BLL_F_END(b))) {
++			if (layout_conflict(b, seg->iomode, &s) == False) {
++				dprintk(" CONFLICT FOUND: "
++				    "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n",
++				    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++				    _2SECTS(b->bll_soff), b->bll_es,
++				    seg->iomode);
++				return False;
++			}
++			n = bll_alloc(seg->offset,
++			    MIN(seg->length, BLL_F_END(b) - seg->offset),
++			    BLOCK_LAYOUT_CACHE, h);
++			/* Check before the trace below dereferences 'n' */
++			if (!n)
++				return False;
++			dprintk(" CACHE hit: Found %Lu(f):%Lu(l): "
++			    "in %Lu(f):%Lu(l):%Lu(s):%d\n",
++			    _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
++			    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++			    _2SECTS(b->bll_soff), b->bll_es);
++
++			n->bll_soff = b->bll_soff + seg->offset - b->bll_foff;
++			n->bll_vol_id.sbid = 0;
++			n->bll_vol_id.devid = b->bll_vol_id.devid;
++			n->bll_es = s;
++			seg->offset += n->bll_len;
++			seg->length -= n->bll_len;
++			if (!seg->length)
++				break;
++		}
++	}
++	return True;
++}
++
++/*
++ * bll_alloc_holey -- queue an extent without backing store on bl_candidates
++ *
++ * Returns the length of the new extent, or 0 if the allocation failed.
++ */
++static u64
++bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length,
++    dev_t dev)
++{
++	pnfs_blocklayout_layout_t	*n;
++
++	n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates);
++	if (!n)
++		return 0;
++	n->bll_es = PNFS_BLOCK_NONE_DATA;
++	n->bll_vol_id.sbid = 0;
++	n->bll_vol_id.devid = dev;
++
++	return n->bll_len;
++}
++
++/*
++ * extents_setup -- mark 'fei' as not owning an extent buffer yet, so that
++ * extents_cleanup() is safe to call at any time.
++ */
++static void
++extents_setup(struct fiemap_extent_info *fei)
++{
++	fei->fi_extents_start = NULL;
++}
++
++/*
++ * extents_count -- Determine the number of extents for a given range.
++ *
++ * No need to call set_fs() here because the function
++ * doesn't use copy_to_user() if it's only counting
++ * the number of extents needed.
++ */
++static void
++extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
++{
++	dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len));
++	fei->fi_flags = FIEMAP_FLAG_SYNC;
++	fei->fi_extents_max = 0;
++	fei->fi_extents_start = NULL;
++	fei->fi_extents_mapped = 0;
++	i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1);
++}
++
++/*
++ * extents_get -- Get list of extents for range
++ *
++ * extents_count() must have been called before this routine such that
++ * fi_extents_mapped is known.
++ */
++static boolean_t
++extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
++{
++	int			m_space,
++				rval;
++	struct fiemap_extent	*fe;
++	mm_segment_t		old_fs = get_fs();
++
++	/*
++	 * Now malloc the correct amount of space
++	 * needed. It's possible for the file to have changed
++	 * between calls which would require more space for
++	 * the extents. 
If that occurs the last extent will ++ * not have FIEMAP_EXTENT_LAST set and the error will ++ * be caught in extents_process(). ++ */ ++ m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent); ++ fe = kmalloc(m_space, GFP_KERNEL); ++ if (!fe) ++ return False; ++ memset(fe, 0, m_space); ++ ++ fei->fi_extents_max = fei->fi_extents_mapped; ++ fei->fi_extents_mapped = 0; ++ fei->fi_extents_start = fe; ++ ++ set_fs(KERNEL_DS); ++ rval = i->i_op->fiemap(i, fei, foff, len + ++ (1 << i->i_sb->s_blocksize_bits) - 1); ++ set_fs(old_fs); ++ ++ if (rval || !fei->fi_extents_mapped) { ++ dprintk(" No extents. Wanted %d, got %d\n", ++ fei->fi_extents_max, fei->fi_extents_mapped); ++ kfree(fe); ++ fei->fi_extents_start = NULL; ++ return False; ++ } else ++ return True; ++} ++ ++/* ++ * extents_process -- runs through the extent returned from the file system and ++ * creates block layout entries. ++ */ ++static boolean_t ++extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates, ++ struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b) ++{ ++ struct fiemap_extent *fep, ++ *fep_last = NULL; ++ int i; ++ pnfs_blocklayout_layout_t *n; ++ u64 last_end, ++ rval; ++ ++ dprintk("--> %s\n", __func__); ++ for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped; ++ i++, fep++) { ++ ++ BUG_ON(!fep->fe_physical); ++ /* ++ * Deal with corner cases of hoel-y files. ++ */ ++ if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) != ++ fep->fe_logical)) { ++ ++ /* ++ * If the last extent doesn't end logically ++ * at the beginning of the current we've got ++ * hole and need to create a pNFS extent. 
++ */ ++ dprintk(" Got a hole at %Ld:%Ld \n", ++ _2SECTS(fep_last->fe_logical), ++ _2SECTS(fep_last->fe_length)); ++ last_end = fep_last->fe_logical + fep_last->fe_length; ++ rval = bll_alloc_holey(bl_candidates, last_end, ++ fep->fe_logical - last_end, dev); ++ if (!rval) ++ return False; ++ seg->length += rval; ++ } ++ ++ n = bll_alloc(fep->fe_logical, fep->fe_length, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ if (unlikely(n == NULL)) { ++ dprintk("%s: bll_alloc failed\n", __func__); ++ return False; ++ } ++ ++ n->bll_soff = fep->fe_physical; ++ n->bll_es = seg->iomode == IOMODE_READ ? ++ PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += fep->fe_length; ++ print_bll(n, "New extent"); ++ fep_last = fep; ++ } ++ dprintk("<-- %s (i=%d)\n", __func__, i); ++ ++ return True; ++} ++ ++static void ++extents_cleanup(struct fiemap_extent_info *fei) ++{ ++ if (fei->fi_extents_start) { ++ kfree(fei->fi_extents_start); ++ fei->fi_extents_start = NULL; ++ } ++} ++ ++/* ++ * device_slice -- check to see if device is a slice or DM ++ */ ++static boolean_t ++device_slice(dev_t devid) ++{ ++ struct block_device *bd = blkdev_get_by_dev(devid, FMODE_READ, NULL); ++ boolean_t rval = False; ++ ++ if (bd) { ++ if (bd->bd_disk->minors > 1) ++ rval = True; ++ blkdev_put(bd, FMODE_READ); ++ } ++ return rval; ++} ++ ++/* ++ * device_dm -- check to see if device is a Device Mapper volume. 
++ * ++ * Returns 1 for DM or 0 if not ++ */ ++static boolean_t ++device_dm(dev_t devid) ++{ ++ boolean_t rval = False; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMCHK; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Failed upcall to check on DM status\n"); ++ } else if (res->u.dm_vol) { ++ rval = True; ++ dprintk("Device is DM volume\n"); ++ } else ++ dprintk("Device is not DM volume\n"); ++ kfree(res); ++ ++ return rval; ++} ++ ++static boolean_t ++layout_inode_add(struct inode *i, bl_layout_rec_t **p) ++{ ++ bl_layout_rec_t *r = NULL; ++ ++ if (!i->i_op->fiemap || !i->i_fop->fallocate) { ++ printk("pNFS: file system doesn't support required fiemap or" ++ "fallocate methods\n"); ++ return False; ++ } ++ ++ r = kmalloc(sizeof (*r), GFP_KERNEL); ++ if (!r) ++ goto error; ++ ++ r->blr_rdev = i->i_sb->s_dev; ++ r->blr_inode = i; ++ r->blr_orig_size = i->i_size; ++ r->blr_ext_size = 0; ++ r->blr_recalled = 0; ++ INIT_LIST_HEAD(&r->blr_layouts); ++ spin_lock_init(&r->blr_lock); ++ spin_lock(&layout_hashtbl_lock); ++ list_add_tail(&r->blr_hash, &layout_hash); ++ spin_unlock(&layout_hashtbl_lock); ++ *p = r; ++ return True; ++ ++error: ++ if (r) ++ kfree(r); ++ return False; ++} ++ ++static bl_layout_rec_t * ++__layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ if (!list_empty(&layout_hash)) { ++ list_for_each_entry(r, &layout_hash, blr_hash) { ++ if ((r->blr_inode->i_ino == i->i_ino) && ++ (r->blr_rdev == i->i_sb->s_dev)) { ++ return r; ++ } ++ } ++ } ++ return NULL; ++} ++ ++static bl_layout_rec_t * ++layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ spin_unlock(&layout_hashtbl_lock); ++ ++ return r; ++} ++ ++static void ++layout_inode_del(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock); 
++ if (list_empty(&r->blr_layouts)) { ++ list_del(&r->blr_hash); ++ spin_unlock(&r->blr_lock); ++ kfree(r); ++ } else { ++ spin_unlock(&r->blr_lock); ++ } ++ } else { ++ dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n", ++ __func__, i->i_sb->s_dev, i->i_ino); ++ } ++ spin_unlock(&layout_hashtbl_lock); ++} ++ ++/* ++ * map_state2name -- converts state in ascii string. ++ * ++ * Used for debug messages only. ++ */ ++static char * ++map_state2name(enum pnfs_block_extent_state4 s) ++{ ++ switch (s) { ++ case PNFS_BLOCK_READWRITE_DATA: return " RW"; ++ case PNFS_BLOCK_READ_DATA: return " RO"; ++ case PNFS_BLOCK_INVALID_DATA: return "INVALID"; ++ case PNFS_BLOCK_NONE_DATA: return " NONE"; ++ default: ++ BUG(); ++ } ++} ++ ++static pnfs_blocklayout_devinfo_t * ++bld_alloc(struct list_head *volumes, int type) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ return NULL; ++ ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = type; ++ list_add_tail(&bld->bld_list, volumes); ++ ++ return bld; ++} ++ ++static void ++bld_free(pnfs_blocklayout_devinfo_t *bld) ++{ ++ list_del(&bld->bld_list); ++ kfree(bld); ++} ++ ++static void ++print_bll(pnfs_blocklayout_layout_t *b, char *text) ++{ ++ dprintk(" BLL: %s\n", text); ++ dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len), ++ map_state2name(b->bll_es)); ++} ++ ++static inline void ++bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c) ++{ ++ pnfs_blocklayout_layout_t *n; ++ int dbg_count = 0; ++ u64 endpoint; ++ ++ BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA); ++ while (c->bll_list.next != &r->blr_layouts) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ endpoint = BLL_S_END(c); ++ if ((n->bll_soff >= c->bll_soff) && ++ (n->bll_soff < endpoint)) { ++ if (endpoint < BLL_S_END(n)) { ++ /* ++ * The following is possible. 
++ * ++ * ++ * Existing: +---+ +---+ ++ * New: +-----------------------+ ++ * The client request merge entries together ++ * but didn't require picking up all of the ++ * last entry. So, we still need to delete ++ * the last entry and add the remaining space ++ * to the new entry. ++ */ ++ c->bll_len += BLL_S_END(n) - endpoint; ++ } ++ dbg_count++; ++ list_del(&n->bll_list); ++ kfree(n); ++ } else { ++ break; ++ } ++ } ++ /* ---- Debug only, remove before integration ---- */ ++ if (dbg_count) ++ dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n", ++ dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c))); ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = kmalloc(sizeof (*n), GFP_KERNEL); ++ if (n) { ++ memset(n, 0, sizeof (*n)); ++ n->bll_foff = offset; ++ n->bll_len = len; ++ n->bll_cache_state = state; ++ if (h) ++ list_add_tail(&n->bll_list, h); ++ } ++ return n; ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c, ++ struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = bll_alloc(b->bll_foff, b->bll_len, c, h); ++ if (n) { ++ n->bll_es = b->bll_es; ++ n->bll_soff = b->bll_soff; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ } ++ return n; ++} ++ ++static inline boolean_t ++layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s) ++{ ++ /* ---- Normal case ---- */ ++ *s = b->bll_es; ++ ++ switch (b->bll_es) { ++ case PNFS_BLOCK_READWRITE_DATA: ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_READ_DATA; ++ /* ---- Any use is permitted. ---- */ ++ break; ++ case PNFS_BLOCK_READ_DATA: ++ /* ---- Committed as read only data. 
---- */ ++ if (iomode == IOMODE_RW) ++ return False; ++ break; ++ case PNFS_BLOCK_INVALID_DATA: ++ /* ---- Blocks have been allocated, but not initialized ---- */ ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_NONE_DATA; ++ break; ++ case PNFS_BLOCK_NONE_DATA: ++ /* ---- Hole-y file. No backing store avail. ---- */ ++ if (iomode != IOMODE_READ) ++ return False; ++ break; ++ default: ++ BUG(); ++ } ++ return True; ++} ++ ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.38.noarch/fs/nfsd/export.c.orig linux-2.6.38.noarch/fs/nfsd/export.c +--- linux-2.6.38.noarch/fs/nfsd/export.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/export.c 2011-03-26 07:57:44.282821243 -0400 +@@ -16,11 +16,19 @@ + #include + #include + ++#include ++#if defined(CONFIG_SPNFS) ++#include ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif ++#endif + #include + #include + + #include "nfsd.h" + #include "nfsfh.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_EXPORT + +@@ -348,10 +356,84 @@ static int svc_export_upcall(struct cach + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); + } + ++#if defined(CONFIG_PNFSD) ++static struct pnfsd_cb_operations pnfsd_cb_op = { ++ .cb_layout_recall = nfsd_layout_recall_cb, ++ .cb_device_notify = nfsd_device_notify_cb, ++ ++ .cb_get_state = nfs4_pnfs_cb_get_state, ++ .cb_change_state = nfs4_pnfs_cb_change_state, ++}; ++ ++#if defined(CONFIG_SPNFS) ++static struct pnfs_export_operations spnfs_export_ops = { ++ .layout_type = spnfs_layout_type, ++ .get_device_info = spnfs_getdeviceinfo, ++ .get_device_iter = spnfs_getdeviceiter, ++ .layout_get = spnfs_layoutget, ++ .layout_return = spnfs_layoutreturn, ++}; ++ ++static struct pnfs_export_operations spnfs_ds_export_ops = { ++ .get_state = spnfs_get_state, ++}; ++ ++#if defined(CONFIG_SPNFS_BLOCK) ++static struct pnfs_export_operations bl_export_ops = { ++ .layout_type = bl_layout_type, ++ .get_device_info = bl_getdeviceinfo, ++ .get_device_iter = 
bl_getdeviceiter, ++ .layout_get = bl_layoutget, ++ .layout_return = bl_layoutreturn, ++}; ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_SPNFS */ ++#endif /* CONFIG_PNFSD */ ++ + static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); + static struct svc_export *svc_export_lookup(struct svc_export *); + ++static int pnfsd_check_export(struct inode *inode, int *flags) ++{ ++#if defined(CONFIG_PNFSD) ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ if (!inode->i_sb->s_pnfs_op) ++ pnfsd_lexp_init(inode); ++ return 0; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, *flags)) { ++ dprintk("set pnfs block export structure... \n"); ++ inode->i_sb->s_pnfs_op = &bl_export_ops; ++ } else ++#endif /* CONFIG_SPNFS_BLOCK */ ++ /* ++ * spnfs_enabled() indicates we're an MDS. ++ * XXX Better to check an export time option as well. ++ */ ++ if (spnfs_enabled()) { ++ dprintk("set spnfs export structure...\n"); ++ inode->i_sb->s_pnfs_op = &spnfs_export_ops; ++ } else { ++ dprintk("%s spnfs not in use\n", __func__); ++ ++ /* ++ * get_state is needed if we're a DS using spnfs. ++ * XXX Better to check an export time option instead. 
++ */ ++ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops; ++ } ++#endif /* CONFIG_SPNFS */ ++ ++#endif /* CONFIG_PNFSD */ ++ ++ return 0; ++} ++ + static int check_export(struct inode *inode, int *flags, unsigned char *uuid) + { + +@@ -391,8 +473,17 @@ static int check_export(struct inode *in + return -EINVAL; + } + +- return 0; ++#if !defined(CONFIG_SPNFS) ++ if (inode->i_sb->s_pnfs_op && ++ (!inode->i_sb->s_pnfs_op->layout_type || ++ !inode->i_sb->s_pnfs_op->get_device_info || ++ !inode->i_sb->s_pnfs_op->layout_get)) { ++ dprintk("exp_export: export of invalid fs pnfs export ops.\n"); ++ return -EINVAL; ++ } ++#endif /* !CONFIG_SPNFS */ + ++ return pnfsd_check_export(inode, flags); + } + + #ifdef CONFIG_NFSD_V4 +@@ -582,6 +673,8 @@ static int svc_export_parse(struct cache + if (exp.ex_uuid == NULL) + err = -ENOMEM; + } ++ } else if (strcmp(buf, "pnfs") == 0) { ++ exp.ex_pnfs = 1; + } else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else +@@ -656,6 +749,8 @@ static int svc_export_show(struct seq_fi + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } ++ if (exp->ex_pnfs) ++ seq_puts(m, ",pnfs"); + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); +@@ -683,6 +778,7 @@ static void svc_export_init(struct cache + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; ++ new->ex_pnfs = 0; + } + + static void export_update(struct cache_head *cnew, struct cache_head *citem) +@@ -695,6 +791,7 @@ static void export_update(struct cache_h + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; ++ new->ex_pnfs = item->ex_pnfs; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_pathname = item->ex_pathname; +@@ -1662,8 +1759,17 @@ nfsd_export_init(void) + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); +- if (rv) ++ if (rv) { + cache_unregister(&svc_export_cache); ++ goto out; ++ } ++#if defined(CONFIG_PNFSD) ++ 
spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = THIS_MODULE; ++ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ ++out: + return rv; + + } +@@ -1691,6 +1797,12 @@ nfsd_export_shutdown(void) + + exp_writelock(); + ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = NULL; ++ pnfsd_cb_ctl.cb_op = NULL; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); + svcauth_unix_purge(); +diff -up linux-2.6.38.noarch/fs/nfs/dir.c.orig linux-2.6.38.noarch/fs/nfs/dir.c +--- linux-2.6.38.noarch/fs/nfs/dir.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/dir.c 2011-03-26 07:57:44.245821557 -0400 +@@ -1161,19 +1161,22 @@ static void nfs_dentry_iput(struct dentr + if (S_ISDIR(inode->i_mode)) + /* drop any readdir cache as it could easily be old */ + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; +- +- if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { ++ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + drop_nlink(inode); +- nfs_complete_unlink(dentry, inode); +- } + iput(inode); + } + ++static void nfs_d_unlink(struct dentry *parent, struct dentry *dentry) ++{ ++ nfs_complete_unlink(parent, dentry); ++} ++ + const struct dentry_operations nfs_dentry_operations = { + .d_revalidate = nfs_lookup_revalidate, + .d_delete = nfs_dentry_delete, + .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, ++ .d_unlink = nfs_d_unlink, + }; + + static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +@@ -1248,6 +1251,7 @@ const struct dentry_operations nfs4_dent + .d_delete = nfs_dentry_delete, + .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, ++ .d_unlink = nfs_d_unlink, + }; + + /* +diff -up linux-2.6.38.noarch/fs/nfs/direct.c.orig linux-2.6.38.noarch/fs/nfs/direct.c +--- linux-2.6.38.noarch/fs/nfs/direct.c.orig 2011-03-14 
21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/direct.c 2011-03-26 07:57:44.246821549 -0400 +@@ -271,6 +271,38 @@ static const struct rpc_call_ops nfs_rea + .rpc_release = nfs_direct_read_release, + }; + ++static long nfs_direct_read_execute(struct nfs_read_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ nfs_fattr_init(&data->fattr); ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ NFS_PROTO(inode)->read_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct read call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. 
If nfs_readdata_alloc() or get_user_pages() fails, +@@ -287,7 +319,6 @@ static ssize_t nfs_direct_read_schedule_ + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + size_t rsize = NFS_SERVER(inode)->rsize; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -348,26 +379,9 @@ static ssize_t nfs_direct_read_schedule_ + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; +- nfs_fattr_init(&data->fattr); +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct read call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_read_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +@@ -460,12 +474,15 @@ static void nfs_direct_free_writedata(st + } + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg); ++ + static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) + { + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = dreq->ctx->cred, + }; +@@ -499,25 +516,7 @@ static void nfs_direct_write_reschedule( + * Reuse data->task; data->args should not have changed + * since the original request was sent. 
+ */ +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- /* +- * We're called via an RPC callback, so BKL is already held. +- */ +- task = rpc_run_task(&task_setup_data); +- if (!IS_ERR(task)) +- rpc_put_task(task); +- +- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- data->args.count, +- (unsigned long long)data->args.offset); ++ nfs_direct_write_execute(data, &task_setup_data, &msg); + } + + if (put_dreq(dreq)) +@@ -560,10 +559,31 @@ static const struct rpc_call_ops nfs_com + .rpc_release = nfs_direct_commit_release, + }; + ++static long nfs_direct_commit_execute(struct nfs_direct_req *dreq, ++ struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct rpc_task *task; ++ ++ NFS_PROTO(data->inode)->commit_setup(data, msg); ++ ++ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ ++ dreq->commit_data = NULL; ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ return 0; ++} ++ + static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) + { + struct nfs_write_data *data = dreq->commit_data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +@@ -592,16 +612,7 @@ static void nfs_direct_commit_schedule(s + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- NFS_PROTO(data->inode)->commit_setup(data, &msg); +- +- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ +- dreq->commit_data = NULL; +- +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +- +- task = rpc_run_task(&task_setup_data); +- if 
(!IS_ERR(task)) +- rpc_put_task(task); ++ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg); + } + + static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +@@ -703,6 +714,36 @@ static const struct rpc_call_ops nfs_wri + .rpc_release = nfs_direct_write_release, + }; + ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ NFS_PROTO(inode)->write_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. 
If nfs_writedata_alloc() or get_user_pages() fails, +@@ -718,7 +759,6 @@ static ssize_t nfs_direct_write_schedule + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -785,24 +825,8 @@ static ssize_t nfs_direct_write_schedule + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct write call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_write_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +diff -up linux-2.6.38.noarch/fs/nfsd/Kconfig.orig linux-2.6.38.noarch/fs/nfsd/Kconfig +--- linux-2.6.38.noarch/fs/nfsd/Kconfig.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/Kconfig 2011-03-26 07:57:44.278821276 -0400 +@@ -91,3 +91,52 @@ config NFSD_V4 + available from http://linux-nfs.org/. + + If unsure, say N. ++ ++config PNFSD ++ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)" ++ depends on NFSD_V4 && EXPERIMENTAL ++ select EXPORTFS_FILE_LAYOUT ++ help ++ This option enables support for the parallel NFS features of the ++ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) ++ in the kernel's NFS server. ++ ++ Unless you're an NFS developer, say N. 
++ ++config PNFSD_LOCAL_EXPORT ++ bool "Enable pNFS support for exporting local filesystems for debugging purposes" ++ depends on PNFSD ++ help ++ Say Y here if you want your pNFS server to export local file systems ++ over the files layout type. With this option the MDS (metadata ++ server) functions also as a single DS (data server). This is mostly ++ useful for development and debugging purposes. ++ ++ If unsure, say N. ++ ++config SPNFS ++ bool "Provide spNFS server support (EXPERIMENTAL)" ++ depends on PNFSD ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS server support. ++ ++ If unsure, say N. ++ ++config SPNFS_LAYOUTSEGMENTS ++ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)" ++ depends on SPNFS ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS to be able to return layout segments. ++ ++ If unsure, say N. ++ ++config SPNFS_BLOCK ++ bool "Provide Block Layout server support (EXPERIMENTAL)" ++ depends on SPNFS ++ select EXPORTFS_BLOCK_LAYOUT ++ help ++ Say Y here if you want spNFS block layout support ++ ++ If unsure, say N. 
+diff -up linux-2.6.38.noarch/fs/nfsd/Makefile.orig linux-2.6.38.noarch/fs/nfsd/Makefile +--- linux-2.6.38.noarch/fs/nfsd/Makefile.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/Makefile 2011-03-26 07:57:44.279821268 -0400 +@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs + nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o + nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o ++nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o ++nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o ++nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o ++nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o +diff -up linux-2.6.38.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4callback.c +--- linux-2.6.38.noarch/fs/nfsd/nfs4callback.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/nfs4callback.c 2011-03-26 07:57:44.284821225 -0400 +@@ -39,6 +39,8 @@ + + #define NFSDDBG_FACILITY NFSDDBG_PROC + ++static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); ++ + #define NFSPROC4_CB_NULL 0 + #define NFSPROC4_CB_COMPOUND 1 + +@@ -48,6 +50,8 @@ enum { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, ++ NFSPROC4_CLNT_CB_LAYOUT, ++ NFSPROC4_CLNT_CB_DEVICE, + }; + + #define NFS4_MAXTAGLEN 20 +@@ -73,6 +77,19 @@ enum { + #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) ++#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 3 + \ ++ enc_nfs4_fh_sz + 4) ++#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) ++#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 6) ++#define NFS4_dec_cb_device_sz (cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) + + struct nfs4_cb_compound_hdr { + /* args */ +@@ -361,6 +378,151 @@ static void 
encode_cb_recall4args(struct + hdr->nops++; + } + ++#if defined(CONFIG_PNFSD) ++ ++#include "pnfsd.h" ++ ++/* ++ * CB_LAYOUTRECALL4args ++ * ++ * struct layoutrecall_file4 { ++ * nfs_fh4 lor_fh; ++ * offset4 lor_offset; ++ * length4 lor_length; ++ * stateid4 lor_stateid; ++ * }; ++ * ++ * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) { ++ * case LAYOUTRECALL4_FILE: ++ * layoutrecall_file4 lor_layout; ++ * case LAYOUTRECALL4_FSID: ++ * fsid4 lor_fsid; ++ * case LAYOUTRECALL4_ALL: ++ * void; ++ * }; ++ * ++ * struct CB_LAYOUTRECALL4args { ++ * layouttype4 clora_type; ++ * layoutiomode4 clora_iomode; ++ * bool clora_changed; ++ * layoutrecall4 clora_recall; ++ * }; ++ */ ++static void encode_cb_layout4args(struct xdr_stream *xdr, ++ const struct nfs4_layoutrecall *clr, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ p = xdr_reserve_space(xdr, 5 * 4); ++ *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL); ++ *p++ = cpu_to_be32(clr->cb.cbl_seg.layout_type); ++ *p++ = cpu_to_be32(clr->cb.cbl_seg.iomode); ++ *p++ = cpu_to_be32(clr->cb.cbl_layoutchanged); ++ *p = cpu_to_be32(clr->cb.cbl_recall_type); ++ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) { ++ struct nfs4_fsid fsid = clr->cb.cbl_fsid; ++ ++ p = xdr_reserve_space(xdr, 2 * 8); ++ p = xdr_encode_hyper(p, fsid.major); ++ xdr_encode_hyper(p, fsid.minor); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "fsid 0x%llx-0x%llx\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, fsid.major, fsid.minor); ++ } else if (clr->cb.cbl_recall_type == RETURN_FILE) { ++ int len = clr->clr_file->fi_fhlen; ++ stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid; ++ ++ p = xdr_reserve_space(xdr, 4 + len + 2 * 8); ++ *p++ = cpu_to_be32(len); ++ xdr_encode_opaque_fixed(p, clr->clr_file->fi_fhval, len); ++ p += XDR_QUADLEN(len); ++ p = xdr_encode_hyper(p, clr->cb.cbl_seg.offset); ++ 
xdr_encode_hyper(p, clr->cb.cbl_seg.length); ++ encode_stateid4(xdr, cbl_sid); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "offset %lld length %lld stateid " STATEID_FMT "\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, ++ clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length, ++ STATEID_VAL(cbl_sid)); ++ } else { ++ dprintk("%s: type %x iomode %d changed %d recall_type %d\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type); ++ } ++ hdr->nops++; ++} ++ ++/* ++ * CB_NOTIFY_DEVICEID4args ++ * ++ * typedef opaque notifylist4<>; ++ * ++ * struct notify4 { ++ * bitmap4 notify_mask; ++ * notifylist4 notify_vals; ++ * }; ++ * ++ * struct CB_NOTIFY_DEVICEID4args { ++ * notify4 cnda_changes<>; ++ * }; ++ */ ++static void encode_cb_device4args(struct xdr_stream *xdr, ++ const struct nfs4_notify_device *nd, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ int i; ++ int len = nd->nd_list->cbd_len; ++ struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list; ++ ++ dprintk("NFSD %s: --> num %d\n", __func__, len); ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ p = xdr_reserve_space(xdr, 2 * 4); ++ *p++ = cpu_to_be32(OP_CB_NOTIFY_DEVICEID); ++ /* notify4 cnda_changes<>; */ ++ *p = cpu_to_be32(len); ++ for (i = 0; i < len; i++) { ++ dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n", ++ __func__, cbd[i].cbd_notify_type, ++ cbd[i].cbd_layout_type, ++ cbd[i].cbd_devid.sbid, ++ cbd[i].cbd_devid.devid, ++ cbd[i].cbd_immediate, i); ++ ++ BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE); ++ p = xdr_reserve_space(xdr, 4 * 4 + 2 * 8); ++ /* bitmap4 notify_mask; */ ++ *p++ = cpu_to_be32(1); ++ *p++ = cpu_to_be32(cbd[i].cbd_notify_type); ++ /* opaque notify_vals<>; */ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) ++ *p++ = 
cpu_to_be32(24); ++ else ++ *p++ = cpu_to_be32(20); ++ *p++ = cpu_to_be32(cbd[i].cbd_layout_type); ++ p = xdr_encode_hyper(p, cbd[i].cbd_devid.sbid); ++ xdr_encode_hyper(p, cbd[i].cbd_devid.devid); ++ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { ++ p = xdr_reserve_space(xdr, 4); ++ *p = cpu_to_be32(cbd[i].cbd_immediate); ++ } ++ } ++ hdr->nops++; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * CB_SEQUENCE4args + * +@@ -460,6 +622,8 @@ static int decode_cb_sequence4resok(stru + */ + status = 0; + out: ++ if (status) ++ nfsd4_mark_cb_fault(cb->cb_clp, status); + return status; + out_overflow: + print_overflow_msg(__func__, xdr); +@@ -523,6 +687,39 @@ static void nfs4_xdr_enc_cb_recall(struc + encode_cb_nops(&hdr); + } + ++#if defined(CONFIG_PNFSD) ++static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, ++ struct xdr_stream *xdr, ++ const struct nfsd4_callback *cb) ++{ ++ const struct nfs4_layoutrecall *args = cb->cb_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = cb->cb_minorversion, ++ }; ++ ++ encode_cb_compound4args(xdr, &hdr); ++ encode_cb_sequence4args(xdr, cb, &hdr); ++ encode_cb_layout4args(xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++} ++ ++static void nfs4_xdr_enc_cb_device(struct rpc_rqst *req, ++ struct xdr_stream *xdr, ++ const struct nfsd4_callback *cb) ++{ ++ struct nfs4_notify_device *args = cb->cb_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = cb->cb_minorversion, ++ }; ++ ++ encode_cb_compound4args(xdr, &hdr); ++ encode_cb_sequence4args(xdr, cb, &hdr); ++ encode_cb_device4args(xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++} ++#endif /* CONFIG_PNFSD */ + + /* + * NFSv4.0 and NFSv4.1 XDR decode functions +@@ -569,6 +766,58 @@ out: + return status; + } + ++#if defined(CONFIG_PNFSD) ++static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfsd4_callback *cb) ++{ ++ struct nfs4_cb_compound_hdr hdr; ++ enum nfsstat4 nfserr; ++ int status; ++ ++ 
status = decode_cb_compound4res(xdr, &hdr); ++ if (unlikely(status)) ++ goto out; ++ if (cb) { ++ status = decode_cb_sequence4res(xdr, cb); ++ if (unlikely(status)) ++ goto out; ++ } ++ status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr); ++ if (unlikely(status)) ++ goto out; ++ if (unlikely(nfserr != NFS4_OK)) ++ status = nfs_cb_stat_to_errno(nfserr); ++out: ++ return status; ++} ++ ++static int nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfsd4_callback *cb) ++{ ++ struct nfs4_cb_compound_hdr hdr; ++ enum nfsstat4 nfserr; ++ int status; ++ ++ status = decode_cb_compound4res(xdr, &hdr); ++ if (unlikely(status)) ++ goto out; ++ if (cb) { ++ status = decode_cb_sequence4res(xdr, cb); ++ if (unlikely(status)) ++ goto out; ++ } ++ status = decode_cb_op_status(xdr, OP_CB_NOTIFY_DEVICEID, &nfserr); ++ if (unlikely(status)) ++ goto out; ++ if (unlikely(nfserr != NFS4_OK)) ++ status = nfs_cb_stat_to_errno(nfserr); ++out: ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * RPC procedure tables + */ +@@ -586,6 +835,10 @@ out: + static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, cb_null, cb_null), + PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), ++#if defined(CONFIG_PNFSD) ++ PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), ++ PROC(CB_DEVICE, COMPOUND, cb_device, cb_device), ++#endif + }; + + static struct rpc_version nfs_cb_version4 = { +@@ -686,6 +939,12 @@ static void nfsd4_mark_cb_down(struct nf + warn_no_callback_path(clp, reason); + } + ++static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) ++{ ++ clp->cl_cb_state = NFSD4_CB_FAULT; ++ warn_no_callback_path(clp, reason); ++} ++ + static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) + { + struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); +@@ -783,11 +1042,10 @@ static bool nfsd41_cb_get_slot(struct nf + * TODO: cb_sequence should support referring call lists, cachethis, 
multiple + * slots, and mark callback channel down on communication errors. + */ +-static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_prepare_sequence(struct rpc_task *task, ++ struct nfsd4_callback *cb, ++ struct nfs4_client *clp) + { +- struct nfsd4_callback *cb = calldata; +- struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); +- struct nfs4_client *clp = dp->dl_client; + u32 minorversion = clp->cl_minorversion; + + cb->cb_minorversion = minorversion; +@@ -805,12 +1063,17 @@ static void nfsd4_cb_prepare(struct rpc_ + rpc_call_start(task); + } + +-static void nfsd4_cb_done(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata) + { + struct nfsd4_callback *cb = calldata; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); +- struct nfs4_client *clp = dp->dl_client; + ++ nfsd4_cb_prepare_sequence(task, cb, dp->dl_client); ++} ++ ++static void nfsd4_cb_done_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) ++{ + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_minorversion); + +@@ -821,9 +1084,6 @@ static void nfsd4_cb_done(struct rpc_tas + rpc_wake_up_next(&clp->cl_cb_waitq); + dprintk("%s: freed slot, new seqid=%d\n", __func__, + clp->cl_cb_session->se_cb_seq_nr); +- +- /* We're done looking into the sequence information */ +- task->tk_msg.rpc_resp = NULL; + } + } + +@@ -835,7 +1095,7 @@ static void nfsd4_cb_recall_done(struct + struct nfs4_client *clp = dp->dl_client; + struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + +- nfsd4_cb_done(task, calldata); ++ nfsd4_cb_done_sequence(task, clp); + + if (current_rpc_client != task->tk_client) { + /* We're shutting down or changing cl_cb_client; leave +@@ -884,7 +1144,7 @@ static void nfsd4_cb_recall_release(void + } + + static const struct rpc_call_ops nfsd4_cb_recall_ops = { +- .rpc_call_prepare = nfsd4_cb_prepare, ++ .rpc_call_prepare 
= nfsd4_cb_recall_prepare, + .rpc_call_done = nfsd4_cb_recall_done, + .rpc_release = nfsd4_cb_recall_release, + }; +@@ -1024,3 +1284,188 @@ void nfsd4_cb_recall(struct nfs4_delegat + + run_nfsd4_cb(&dp->dl_recall); + } ++ ++#if defined(CONFIG_PNFSD) ++static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfsd4_callback *cb = calldata; ++ struct nfs4_layoutrecall *clr = container_of(cb, struct nfs4_layoutrecall, clr_recall); ++ ++ nfsd4_cb_prepare_sequence(task, cb, clr->clr_client); ++} ++ ++static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfsd4_callback *cb = calldata; ++ struct nfs4_layoutrecall *clr = container_of(cb, struct nfs4_layoutrecall, clr_recall); ++ struct nfs4_client *clp = clr->clr_client; ++ struct rpc_clnt *current_rpc_client = clp->cl_cb_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ if (current_rpc_client != task->tk_client) { ++ /* We're shutting down or changing cl_cb_client; leave ++ * it to nfsd4_process_cb_update to restart the call if ++ * necessary. */ ++ return; ++ } ++ ++ if (cb->cb_done) ++ return; ++ ++ if (task->tk_status) ++ printk("%s: clp %p cb_client %p fp %p failed with status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ clr->clr_file, ++ task->tk_status); ++ ++ switch (task->tk_status) { ++ case 0: ++ goto done; ++ ++ case -NFS4ERR_NOMATCHING_LAYOUT: ++ task->tk_status = 0; ++ nomatching_layout(clr); ++ goto done; ++ ++ case -NFS4ERR_DELAY: ++ /* Poll the client until it's done with the layout */ ++ /* FIXME: cap number of retries. 
++ * The pnfs standard states that we need to only expire ++ * the client after at-least "lease time" .eg lease-time * 2 ++ * when failing to communicate a recall ++ */ ++ rpc_delay(task, HZ/100); /* 10 mili-seconds */ ++ task->tk_status = 0; ++ rpc_restart_call_prepare(task); ++ return; ++ ++ case -NFS4ERR_BADHANDLE: ++ /* FIXME: handle more gracefully */ ++ goto done; ++ ++ case -NFS4ERR_BAD_STATEID: ++ case -NFS4ERR_BADIOMODE: ++ case -NFS4ERR_BADXDR: ++ case -NFS4ERR_INVAL: ++ case -NFS4ERR_NOTSUPP: ++ case -NFS4ERR_OP_NOT_IN_SESSION: ++ case -NFS4ERR_REP_TOO_BIG: ++ case -NFS4ERR_REP_TOO_BIG_TO_CACHE: ++ case -NFS4ERR_REQ_TOO_BIG: ++ case -NFS4ERR_RETRY_UNCACHED_REP: ++ case -NFS4ERR_TOO_MANY_OPS: ++ case -NFS4ERR_UNKNOWN_LAYOUTTYPE: ++ case -NFS4ERR_WRONG_TYPE: ++ /* We should never get these, yet it could be a result of a ++ * buggy client, therefore no BUG here. ++ */ ++ goto done; ++ ++ default: ++ break; ++ } ++ ++ /* Network partition? */ ++ nfsd4_mark_cb_down(clp, task->tk_status); ++done: ++ cb->cb_done = true; ++} ++ ++static void nfsd4_cb_layout_release(void *calldata) ++{ ++ struct nfsd4_callback *cb = calldata; ++ struct nfs4_layoutrecall *clr = container_of(cb, struct nfs4_layoutrecall, clr_recall); ++ ++ put_layoutrecall(clr); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_layout_ops = { ++ .rpc_call_prepare = nfsd4_cb_layout_prepare, ++ .rpc_call_done = nfsd4_cb_layout_done, ++ .rpc_release = nfsd4_cb_layout_release, ++}; ++ ++/* ++ * Called with state lock. 
++ */ ++void ++nfsd4_cb_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfsd4_callback *cb = &clr->clr_recall; ++ ++ cb->cb_op = clr; ++ cb->cb_clp = clr->clr_client; ++ cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT]; ++ cb->cb_msg.rpc_argp = cb; ++ cb->cb_msg.rpc_resp = cb; ++ cb->cb_msg.rpc_cred = callback_cred; ++ ++ cb->cb_ops = &nfsd4_cb_layout_ops; ++ run_nfsd4_cb(cb); ++} ++ ++static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfsd4_callback *cb = calldata; ++ struct nfs4_notify_device *cbnd = container_of(cb, struct nfs4_notify_device, nd_recall); ++ ++ nfsd4_cb_prepare_sequence(task, cb, cbnd->nd_client); ++} ++ ++static void nfsd4_cb_device_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfsd4_callback *cb = calldata; ++ struct nfs4_notify_device *cbnd = container_of(cb, struct nfs4_notify_device, nd_recall); ++ struct nfs4_client *clp = cbnd->nd_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ dprintk("%s: clp %p cb_client %p: status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ task->tk_status); ++ ++ if (task->tk_status == -EIO) { ++ /* Network partition? */ ++ nfsd4_mark_cb_down(clp, task->tk_status); ++ } ++ cb->cb_done = true; ++} ++ ++static void nfsd4_cb_device_release(void *calldata) ++{ ++ struct nfsd4_callback *cb = calldata; ++ struct nfs4_notify_device *cbnd = container_of(cb, struct nfs4_notify_device, nd_recall); ++ ++ kfree(cbnd); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_device_ops = { ++ .rpc_call_prepare = nfsd4_cb_device_prepare, ++ .rpc_call_done = nfsd4_cb_device_done, ++ .rpc_release = nfsd4_cb_device_release, ++}; ++ ++/* ++ * Called with state lock. 
++ */ ++void ++nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd) ++{ ++ struct nfsd4_callback *cb = &cbnd->nd_recall; ++ ++ cb->cb_op = cbnd; ++ cb->cb_clp = cbnd->nd_client; ++ cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE]; ++ cb->cb_msg.rpc_argp = cb; ++ cb->cb_msg.rpc_resp = cb; ++ cb->cb_msg.rpc_cred = callback_cred; ++ ++ cb->cb_ops = &nfsd4_cb_device_ops; ++ run_nfsd4_cb(cb); ++} ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.38.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4pnfsd.c +--- linux-2.6.38.noarch/fs/nfsd/nfs4pnfsd.c.orig 2011-03-26 07:57:44.286821208 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/nfs4pnfsd.c 2011-03-26 07:57:44.286821208 -0400 +@@ -0,0 +1,1688 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ *
++ *****************************************************************************/
++
++#include "pnfsd.h"
++
++#define NFSDDBG_FACILITY NFSDDBG_PROC
++
++/* Globals */
++/* Next value handed out as si_fileid of a layout stateid; advanced under layout_lock. */
++static u32 current_layoutid = 1;
++
++/*
++ * Currently used for manipulating the layout state.
++ * The sbid hash table below is also modified under this lock.
++ */
++static DEFINE_SPINLOCK(layout_lock);
++
++/* Assert that layout_lock is held; compiles to nothing on !SMP, !DEBUG_SPINLOCK. */
++#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP)
++# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock))
++#else
++# define BUG_ON_UNLOCKED_LAYOUT()
++#endif
++
++/*
++ * Layout state - NFSv4.1 pNFS
++ */
++static struct kmem_cache *pnfs_layout_slab;
++static struct kmem_cache *pnfs_layoutrecall_slab;
++
++/* hash table for nfsd4_pnfs_deviceid.sbid */
++#define SBID_HASH_BITS	8
++#define SBID_HASH_SIZE	(1 << SBID_HASH_BITS)
++#define SBID_HASH_MASK	(SBID_HASH_SIZE - 1)
++
++/* Maps a super_block to the id exported to clients; linked into sbid_hashtbl. */
++struct sbid_tracker {
++	u64 id;
++	struct super_block *sb;
++	struct list_head hash;
++};
++
++static u64 current_sbid;
++static struct list_head sbid_hashtbl[SBID_HASH_SIZE];
++
++/* Bucket index in sbid_hashtbl for a super_block pointer. */
++static inline unsigned long
++sbid_hashval(struct super_block *sb)
++{
++	return hash_ptr(sb, SBID_HASH_BITS);
++}
++
++/* Allocate an uninitialized tracker; returns NULL on allocation failure. */
++static inline struct sbid_tracker *
++alloc_sbid(void)
++{
++	return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL);
++}
++
++/* Unhash a tracker under layout_lock and free it. */
++static void
++destroy_sbid(struct sbid_tracker *sbid)
++{
++	spin_lock(&layout_lock);
++	list_del(&sbid->hash);
++	spin_unlock(&layout_lock);
++	kfree(sbid);
++}
++
++/*
++ * Release both pNFS slab caches and every sbid_tracker still hashed.
++ *
++ * The bucket walk relies on every list head in sbid_hashtbl having been
++ * initialized by nfsd4_init_pnfs_slabs().
++ */
++void
++nfsd4_free_pnfs_slabs(void)
++{
++	int i;
++	struct sbid_tracker *sbid;
++
++	nfsd4_free_slab(&pnfs_layout_slab);
++	nfsd4_free_slab(&pnfs_layoutrecall_slab);
++
++	for (i = 0; i < SBID_HASH_SIZE; i++) {
++		while (!list_empty(&sbid_hashtbl[i])) {
++			sbid = list_first_entry(&sbid_hashtbl[i],
++						struct sbid_tracker,
++						hash);
++			destroy_sbid(sbid);
++		}
++	}
++}
++
++/*
++ * Create the pNFS slab caches and initialize the sbid hash table.
++ *
++ * Returns 0 on success or -ENOMEM if either cache cannot be created.
++ * Nothing is torn down here on failure.
++ * NOTE(review): presumably the caller runs nfsd4_free_pnfs_slabs() on the
++ * error path - confirm.
++ */
++int
++nfsd4_init_pnfs_slabs(void)
++{
++	int i;
++
++	/*
++	 * Initialize the hash buckets before anything that can fail:
++	 * nfsd4_free_pnfs_slabs() walks them unconditionally, and a
++	 * zero-filled list_head is not an empty list (next != head), so
++	 * walking uninitialized buckets would dereference NULL.
++	 */
++	for (i = 0; i < SBID_HASH_SIZE; i++) {
++		INIT_LIST_HEAD(&sbid_hashtbl[i]);
++	}
++
++	pnfs_layout_slab = kmem_cache_create("pnfs_layouts",
++			sizeof(struct nfs4_layout), 0, 0, NULL);
++	if (pnfs_layout_slab == NULL)
++		return -ENOMEM;
++	pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls",
++			sizeof(struct nfs4_layoutrecall), 0, 0, NULL);
++	if (pnfs_layoutrecall_slab == NULL)
++		return -ENOMEM;
++
++	return 0;
++}
++
++/* XXX: Need to implement the notify types and track which
++ * clients have which devices. */
++/*
++ * Take a device reference on the confirmed client identified by clid,
++ * under the nfs4 state lock. 'types' is currently unused. Does nothing
++ * if no confirmed client matches.
++ */
++void pnfs_set_device_notify(clientid_t *clid, unsigned int types)
++{
++	struct nfs4_client *clp;
++	dprintk("%s: -->\n", __func__);
++
++	nfs4_lock_state();
++	/* Indicate that client has a device so we can only notify
++	 * the correct clients */
++	clp = find_confirmed_client(clid);
++	if (clp) {
++		atomic_inc(&clp->cl_deviceref);
++		dprintk("%s: Incr device count (clnt %p) to %d\n",
++			__func__, clp, atomic_read(&clp->cl_deviceref));
++	}
++	nfs4_unlock_state();
++}
++
++/* Clear notifications for this client
++ * XXX: Do we need to loop through and clean up all
++ * krefs when nfsd cleans up the client?
*/ ++void pnfs_clear_device_notify(struct nfs4_client *clp) ++{ ++ atomic_dec(&clp->cl_deviceref); ++ dprintk("%s: Decr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++} ++ ++static struct nfs4_layout_state * ++alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid) ++{ ++ struct nfs4_layout_state *new; ++ ++ /* FIXME: use a kmem_cache */ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return new; ++ get_nfs4_file(fp); ++ INIT_LIST_HEAD(&new->ls_perfile); ++ INIT_LIST_HEAD(&new->ls_layouts); ++ kref_init(&new->ls_ref); ++ new->ls_client = clp; ++ new->ls_file = fp; ++ new->ls_stateid.si_boot = stateid->si_boot; ++ new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */ ++ new->ls_stateid.si_generation = 1; ++ spin_lock(&layout_lock); ++ new->ls_stateid.si_fileid = current_layoutid++; ++ list_add(&new->ls_perfile, &fp->fi_layout_states); ++ spin_unlock(&layout_lock); ++ return new; ++} ++ ++static inline void ++get_layout_state(struct nfs4_layout_state *ls) ++{ ++ kref_get(&ls->ls_ref); ++} ++ ++static void ++destroy_layout_state_common(struct nfs4_layout_state *ls) ++{ ++ struct nfs4_file *fp = ls->ls_file; ++ ++ dprintk("pNFS %s: ls %p fp %p clp %p\n", __func__, ls, fp, ++ ls->ls_client); ++ BUG_ON(!list_empty(&ls->ls_layouts)); ++ kfree(ls); ++ put_nfs4_file(fp); ++} ++ ++static void ++destroy_layout_state(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ spin_lock(&layout_lock); ++ list_del(&ls->ls_perfile); ++ spin_unlock(&layout_lock); ++ destroy_layout_state_common(ls); ++} ++ ++static void ++destroy_layout_state_locked(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ list_del(&ls->ls_perfile); ++ destroy_layout_state_common(ls); ++} ++ ++static inline void ++put_layout_state(struct nfs4_layout_state *ls) ++{ ++ 
dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state); ++} ++ ++static inline void ++put_layout_state_locked(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state_locked); ++} ++ ++/* ++ * Search the fp->fi_layout_state list for a layout state with the clientid. ++ * If not found, then this is a 'first open/delegation/lock stateid' from ++ * the client for this file. ++ * Called under the layout_lock. ++ */ ++static struct nfs4_layout_state * ++find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp) ++{ ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) { ++ if (ls->ls_client == clp) { ++ dprintk("pNFS %s: before GET ls %p ls_ref %d\n", ++ __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ get_layout_state(ls); ++ return ls; ++ } ++ } ++ return NULL; ++} ++ ++static __be32 ++verify_stateid(struct nfs4_file *fp, stateid_t *stateid) ++{ ++ struct nfs4_stateid *local = NULL; ++ struct nfs4_delegation *temp = NULL; ++ ++ /* check if open or lock stateid */ ++ local = find_stateid(stateid, RD_STATE); ++ if (local) ++ return 0; ++ temp = find_delegation_stateid(fp->fi_inode, stateid); ++ if (temp) ++ return 0; ++ return nfserr_bad_stateid; ++} ++ ++/* ++ * nfs4_preocess_layout_stateid () ++ * ++ * We have looked up the nfs4_file corresponding to the current_fh, and ++ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op() ++ * that make sense with a layout stateid. ++ * ++ * Called with the state_lock held ++ * Returns zero and stateid is updated, or error. ++ * ++ * Note: the struct nfs4_layout_state pointer is only set by layoutget. 
++ */ ++static __be32 ++nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid, struct nfs4_layout_state **lsp) ++{ ++ struct nfs4_layout_state *ls = NULL; ++ __be32 status = 0; ++ ++ dprintk("--> %s clp %p fp %p \n", __func__, clp, fp); ++ ++ dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ goto out; ++ ++ /* Is this the first use of this layout ? */ ++ spin_lock(&layout_lock); ++ ls = find_get_layout_state(clp, fp); ++ spin_unlock(&layout_lock); ++ if (!ls) { ++ /* Only alloc layout state on layoutget (which sets lsp). */ ++ if (!lsp) { ++ dprintk("%s ERROR: Not layoutget & no layout stateid\n", ++ __func__); ++ status = nfserr_bad_stateid; ++ goto out; ++ } ++ dprintk("%s Initial stateid for layout: file %p client %p\n", ++ __func__, fp, clp); ++ ++ /* verify input stateid */ ++ status = verify_stateid(fp, stateid); ++ if (status) { ++ dprintk("%s ERROR: invalid open/deleg/lock stateid\n", ++ __func__); ++ goto out; ++ } ++ ls = alloc_init_layout_state(clp, fp, stateid); ++ if (!ls) { ++ dprintk("%s pNFS ERROR: no memory for layout state\n", ++ __func__); ++ status = nfserr_resource; ++ goto out; ++ } ++ } else { ++ dprintk("%s Not initial stateid. Layout state %p file %p\n", ++ __func__, ls, fp); ++ ++ /* BAD STATEID */ ++ status = nfserr_bad_stateid; ++ if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque, ++ sizeof(stateid_opaque_t)) != 0) { ++ ++ /* if a LAYOUTGET operation and stateid is a valid ++ * open/deleg/lock stateid, accept it as a parallel ++ * initial layout stateid ++ */ ++ if (lsp && ((verify_stateid(fp, stateid)) == 0)) { ++ dprintk("%s parallel initial layout state\n", ++ __func__); ++ goto verified; ++ } ++ ++ dprintk("%s ERROR bad opaque in stateid 1\n", __func__); ++ goto out_put; ++ } ++ ++ /* stateid is a valid layout stateid for this file. 
*/ ++ if (stateid->si_generation > ls->ls_stateid.si_generation) { ++ dprintk("%s bad stateid 1\n", __func__); ++ goto out_put; ++ } ++ } ++verified: ++ status = 0; ++ ++ /* Return the layout state if requested */ ++ if (lsp) { ++ get_layout_state(ls); ++ *lsp = ls; ++ } ++ dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(&ls->ls_stateid)); ++out_put: ++ dprintk("%s PUT LO STATE:\n", __func__); ++ put_layout_state(ls); ++out: ++ dprintk("<-- %s status %d\n", __func__, htonl(status)); ++ ++ return status; ++} ++ ++static inline struct nfs4_layout * ++alloc_layout(void) ++{ ++ return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL); ++} ++ ++static inline void ++free_layout(struct nfs4_layout *lp) ++{ ++ kmem_cache_free(pnfs_layout_slab, lp); ++} ++ ++#define update_layout_stateid(ls, sid) { \ ++ update_stateid(&(ls)->ls_stateid); \ ++ dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", \ ++ __func__, (ls)->ls_stateid.si_generation, (ls)); \ ++ memcpy((sid), &(ls)->ls_stateid, sizeof(stateid_t)); \ ++} ++ ++static void ++init_layout(struct nfs4_layout_state *ls, ++ struct nfs4_layout *lp, ++ struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg, ++ stateid_t *stateid) ++{ ++ dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__, ++ ls, lp, clp, fp, fp->fi_inode); ++ ++ get_nfs4_file(fp); ++ lp->lo_client = clp; ++ lp->lo_file = fp; ++ get_layout_state(ls); ++ lp->lo_state = ls; ++ memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg)); ++ spin_lock(&layout_lock); ++ update_layout_stateid(ls, stateid); ++ list_add_tail(&lp->lo_perstate, &ls->ls_layouts); ++ list_add_tail(&lp->lo_perclnt, &clp->cl_layouts); ++ list_add_tail(&lp->lo_perfile, &fp->fi_layouts); ++ spin_unlock(&layout_lock); ++ dprintk("pNFS %s end\n", __func__); ++} ++ ++static void ++dequeue_layout(struct nfs4_layout *lp) ++{ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del(&lp->lo_perclnt); ++ list_del(&lp->lo_perfile); ++ 
list_del(&lp->lo_perstate); ++} ++ ++static void ++destroy_layout(struct nfs4_layout *lp) ++{ ++ struct nfs4_client *clp; ++ struct nfs4_file *fp; ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ clp = lp->lo_client; ++ fp = lp->lo_file; ++ ls = lp->lo_state; ++ dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n", ++ __func__, lp, clp, fp, fp->fi_inode, ++ list_empty(&ls->ls_layouts)); ++ ++ kmem_cache_free(pnfs_layout_slab, lp); ++ /* release references taken by init_layout */ ++ put_layout_state_locked(ls); ++ put_nfs4_file(fp); ++} ++ ++void fs_layout_return(struct super_block *sb, struct inode *ino, ++ struct nfsd4_pnfs_layoutreturn *lrp, int flags, ++ void *recall_cookie) ++{ ++ int ret; ++ ++ if (unlikely(!sb->s_pnfs_op->layout_return)) ++ return; ++ ++ lrp->lr_flags = flags; ++ lrp->args.lr_cookie = recall_cookie; ++ ++ if (!ino) /* FSID or ALL */ ++ ino = sb->s_root->d_inode; ++ ++ ret = sb->s_pnfs_op->layout_return(ino, &lrp->args); ++ dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx " ++ "cookie = %p flags 0x%x status=%d\n", ++ __func__, ino->i_ino, lrp->args.lr_seg.iomode, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, ++ recall_cookie, flags, ret); ++} ++ ++static u64 ++alloc_init_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ struct sbid_tracker *new = alloc_sbid(); ++ unsigned long hash_idx = sbid_hashval(sb); ++ u64 id = 0; ++ ++ if (likely(new)) { ++ spin_lock(&layout_lock); ++ id = ++current_sbid; ++ new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK); ++ id = new->id; ++ BUG_ON(id == 0); ++ new->sb = sb; ++ ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) ++ if (sbid->sb == sb) { ++ kfree(new); ++ id = sbid->id; ++ spin_unlock(&layout_lock); ++ return id; ++ } ++ list_add(&new->hash, &sbid_hashtbl[hash_idx]); ++ spin_unlock(&layout_lock); ++ } ++ return id; ++} ++ ++struct super_block * ++find_sbid_id(u64 id) ++{ ++ struct sbid_tracker *sbid; ++ struct 
super_block *sb = NULL; ++ unsigned long hash_idx = id & SBID_HASH_MASK; ++ int pos = 0; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->id != id) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ sb = sbid->sb; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return sb; ++} ++ ++u64 ++find_create_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ unsigned long hash_idx = sbid_hashval(sb); ++ int pos = 0; ++ u64 id = 0; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->sb != sb) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ id = sbid->id; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ if (!id) ++ id = alloc_init_sbid(sb); ++ ++ return id; ++} ++ ++/* ++ * Create a layoutrecall structure ++ * An optional layoutrecall can be cloned (except for the layoutrecall lists) ++ */ ++static struct nfs4_layoutrecall * ++alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_client *clp, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ dprintk("NFSD %s\n", __func__); ++ clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL); ++ if (clr == NULL) ++ return clr; ++ ++ dprintk("NFSD %s -->\n", __func__); ++ ++ memset(clr, 0, sizeof(*clr)); ++ if (lrfile) ++ get_nfs4_file(lrfile); ++ clr->clr_client = clp; ++ clr->clr_file = lrfile; ++ clr->cb = *cbl; ++ ++ kref_init(&clr->clr_ref); ++ INIT_LIST_HEAD(&clr->clr_perclnt); ++ INIT_WORK(&clr->clr_recall.cb_work, nfsd4_do_callback_rpc); ++ ++ dprintk("NFSD %s return %p\n", __func__, clr); ++ return clr; ++} ++ ++static void ++get_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ kref_get(&clr->clr_ref); ++} ++ ++static void ++destroy_layoutrecall(struct kref *kref) ++{ ++ 
struct nfs4_layoutrecall *clr = ++ container_of(kref, struct nfs4_layoutrecall, clr_ref); ++ dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr, ++ clr->clr_file, clr->clr_client); ++ BUG_ON(!list_empty(&clr->clr_perclnt)); ++ if (clr->clr_file) ++ put_nfs4_file(clr->clr_file); ++ kmem_cache_free(pnfs_layoutrecall_slab, clr); ++} ++ ++int ++put_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ return kref_put(&clr->clr_ref, destroy_layoutrecall); ++} ++ ++void * ++layoutrecall_done(struct nfs4_layoutrecall *clr) ++{ ++ void *recall_cookie = clr->cb.cbl_cookie; ++ struct nfs4_layoutrecall *parent = clr->parent; ++ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del_init(&clr->clr_perclnt); ++ put_layoutrecall(clr); ++ ++ if (parent && !put_layoutrecall(parent)) ++ recall_cookie = NULL; ++ ++ return recall_cookie; ++} ++ ++/* ++ * get_state() and cb_get_state() are ++ */ ++void ++release_pnfs_ds_dev_list(struct nfs4_stateid *stp) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ while (!list_empty(&stp->st_pnfs_ds_id)) { ++ ddp = list_entry(stp->st_pnfs_ds_id.next, ++ struct pnfs_ds_dev_entry, dd_dev_entry); ++ list_del(&ddp->dd_dev_entry); ++ kfree(ddp); ++ } ++} ++ ++static int ++nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ ddp = kmalloc(sizeof(*ddp), GFP_KERNEL); ++ if (!ddp) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&ddp->dd_dev_entry); ++ list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id); ++ ddp->dd_dsid = dsid; ++ return 0; ++} ++ ++/* ++ * are two octet ranges overlapping? 
++ * start1 last1 ++ * |-----------------| ++ * start2 last2 ++ * |----------------| ++ */ ++static inline int ++lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 last1 = last_byte_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 last2 = last_byte_offset(start2, l2->length); ++ int ret; ++ ++ /* if last1 == start2 there's a single byte overlap */ ++ ret = (last2 >= start1) && (last1 >= start2); ++ dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__, ++ l1->offset, l1->length, l2->offset, l2->length, ret); ++ return ret; ++} ++ ++static inline int ++same_fsid_major(struct nfs4_fsid *fsid, u64 major) ++{ ++ return fsid->major == major; ++} ++ ++static inline int ++same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh) ++{ ++ return same_fsid_major(fsid, current_fh->fh_export->ex_fsid); ++} ++ ++/* ++ * find a layout recall conflicting with the specified layoutget ++ */ ++static int ++is_layout_recalled(struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != seg->layout_type) ++ continue; ++ if (clr->cb.cbl_recall_type == RETURN_ALL) ++ goto found; ++ if (clr->cb.cbl_recall_type == RETURN_FSID) { ++ if (same_fsid(&clr->cb.cbl_fsid, current_fh)) ++ goto found; ++ else ++ continue; ++ } ++ BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE); ++ if (clr->cb.cbl_seg.clientid == seg->clientid && ++ lo_seg_overlapping(&clr->cb.cbl_seg, seg)) ++ goto found; ++ } ++ spin_unlock(&layout_lock); ++ return 0; ++found: ++ spin_unlock(&layout_lock); ++ return 1; ++} ++ ++/* ++ * are two octet ranges overlapping or adjacent? 
++ */
++static inline int
++lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
++{
++	u64 start1 = l1->offset;
++	u64 end1 = end_offset(start1, l1->length);
++	u64 start2 = l2->offset;
++	u64 end2 = end_offset(start2, l2->length);
++
++	/* if end1 == start2 the ranges are adjacent */
++	return (end2 >= start1) && (end1 >= start2);
++}
++
++/*
++ * Grow segment lo in place so that it also covers segment lg.
++ * Leaves lo untouched when it already covers lg. A resulting end of
++ * NFS4_MAX_UINT64 is stored as the length itself rather than end - start.
++ */
++static void
++extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg)
++{
++	u64 lo_start = lo->offset;
++	u64 lo_end = end_offset(lo_start, lo->length);
++	u64 lg_start = lg->offset;
++	u64 lg_end = end_offset(lg_start, lg->length);
++
++	/* lo already covers lg? */
++	if (lo_start <= lg_start && lg_end <= lo_end)
++		return;
++
++	/* extend start offset */
++	if (lo_start > lg_start)
++		lo_start = lg_start;
++
++	/* extend end offset */
++	if (lo_end < lg_end)
++		lo_end = lg_end;
++
++	lo->offset = lo_start;
++	lo->length = (lo_end == NFS4_MAX_UINT64) ?
++		      lo_end : lo_end - lo_start;
++}
++
++/*
++ * Try to fold seg into an existing layout on fp's fi_layouts list.
++ *
++ * A layout qualifies when layout type, clientid and iomode match and the
++ * byte ranges overlap or touch; the first such layout is extended.
++ * The list is walked under layout_lock.
++ *
++ * Returns the layout that absorbed seg, or NULL when none qualified.
++ */
++static struct nfs4_layout *
++merge_layout(struct nfs4_file *fp,
++	     struct nfs4_client *clp,
++	     struct nfsd4_layout_seg *seg)
++{
++	struct nfs4_layout *lp;
++	struct nfs4_layout *merged = NULL;
++
++	spin_lock(&layout_lock);
++	list_for_each_entry (lp, &fp->fi_layouts, lo_perfile)
++		if (lp->lo_seg.layout_type == seg->layout_type &&
++		    lp->lo_seg.clientid == seg->clientid &&
++		    lp->lo_seg.iomode == seg->iomode &&
++		    lo_seg_mergeable(&lp->lo_seg, seg)) {
++			extend_layout(&lp->lo_seg, seg);
++			/*
++			 * Record the hit separately: after a full traversal
++			 * the list_for_each_entry cursor is not NULL, it
++			 * points at the container of the list head.
++			 */
++			merged = lp;
++			break;
++		}
++	spin_unlock(&layout_lock);
++
++	return merged;
++}
++
++/*
++ * LAYOUTGET: obtain a layout from the exporting file system and record it.
++ *
++ * Looks up (or creates) the superblock id, the nfs4_file and the confirmed
++ * client, validates the layout stateid, refuses requests that conflict with
++ * an outstanding recall, and then calls the file system's layout_get with
++ * the state lock dropped. On success the returned segment is either merged
++ * into an existing layout (when the file system allows it) or installed as
++ * a new one.
++ *
++ * Returns NFS4_OK or an nfserr_* / NFS4ERR_* value in network byte order.
++ */
++__be32
++nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp,
++		     struct exp_xdr_stream *xdr)
++{
++	u32 status;
++	__be32 nfserr;
++	struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode;
++	struct super_block *sb = ino->i_sb;
++	int can_merge;
++	struct nfs4_file *fp;
++	struct nfs4_client *clp;
++	struct nfs4_layout *lp = NULL;
++	struct nfs4_layout_state *ls = NULL;
++	struct nfsd4_pnfs_layoutget_arg args = {
++		.lg_minlength = lgp->lg_minlength,
++		.lg_fh = &lgp->lg_fhp->fh_handle,
++	};
++	struct nfsd4_pnfs_layoutget_res res = {
++		.lg_seg = lgp->lg_seg,
++	};
++
++	dprintk("NFSD: %s Begin\n", __func__);
++
++	/* The state lock is not held yet, so bail out through 'out'. */
++	args.lg_sbid = find_create_sbid(sb);
++	if (!args.lg_sbid) {
++		nfserr = nfserr_layouttrylater;
++		goto out;
++	}
++
++	can_merge = sb->s_pnfs_op->can_merge_layouts != NULL &&
++		    sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type);
++
++	nfs4_lock_state();
++	fp = find_alloc_file(ino, lgp->lg_fhp);
++	clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid);
++	dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp);
++	if (!fp || !clp) {
++		nfserr = nfserr_inval;
++		goto out_unlock;
++	}
++
++	/* Check decoded layout stateid */
++	nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls);
++	if (nfserr)
++		goto out_unlock;
++
++	if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) {
++		nfserr = nfserr_recallconflict;
++		/*
++		 * The state lock and the ls/fp references are held here,
++		 * so leave through out_unlock to release them.
++		 */
++		goto out_unlock;
++	}
++
++	/* pre-alloc layout in case we can't merge after we call
++	 * the file system
++	 */
++	lp = alloc_layout();
++	if (!lp) {
++		nfserr = nfserr_layouttrylater;
++		goto out_unlock;
++	}
++
++	dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd "
++		"iomode %u offset %llu length %llu\n",
++		__func__, lgp->lg_seg.layout_type,
++		exp_xdr_qbytes(xdr->end - xdr->p),
++		lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length);
++
++	/* FIXME: need to eliminate the use of the state lock */
++	nfs4_unlock_state();
++	status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res);
++	nfs4_lock_state();
++
++	dprintk("pNFS %s: post-export status %u "
++		"iomode %u offset %llu length %llu\n",
++		__func__, status, res.lg_seg.iomode,
++		res.lg_seg.offset, res.lg_seg.length);
++
++	/*
++	 * The allowable error codes for the layout_get pNFS export
++	 * operations vector function (from the file system) can be
++	 * expanded as needed to include other errors defined for
++	 * the RFC 5661 LAYOUTGET operation.
++	 */
++	switch (status) {
++	case 0:
++		nfserr = NFS4_OK;
++		break;
++	case NFS4ERR_ACCESS:
++	case NFS4ERR_BADIOMODE:
++		/* No support for LAYOUTIOMODE4_RW layouts */
++	case NFS4ERR_BADLAYOUT:
++		/* No layout matching loga_minlength rules */
++	case NFS4ERR_INVAL:
++	case NFS4ERR_IO:
++	case NFS4ERR_LAYOUTTRYLATER:
++	case NFS4ERR_LAYOUTUNAVAILABLE:
++	case NFS4ERR_LOCKED:
++	case NFS4ERR_NOSPC:
++	case NFS4ERR_RECALLCONFLICT:
++	case NFS4ERR_SERVERFAULT:
++	case NFS4ERR_TOOSMALL:
++		/* Requested layout too big for loga_maxcount */
++	case NFS4ERR_WRONG_TYPE:
++		/* Not a regular file */
++		nfserr = cpu_to_be32(status);
++		goto out_freelayout;
++	default:
++		BUG();
++		nfserr = nfserr_serverfault;
++		/*
++		 * Only reached if BUG() returns (it can be compiled out);
++		 * do not fall into the success path and install a layout
++		 * for a request that is being failed.
++		 */
++		goto out_freelayout;
++	}
++
++	lgp->lg_seg = res.lg_seg;
++	lgp->lg_roc = res.lg_return_on_close;
++
++	/* SUCCESS!
++	 * Can the new layout be merged into an existing one?
++	 * If so, free unused layout struct
++	 */
++	if (can_merge && merge_layout(fp, clp, &res.lg_seg))
++		goto out_freelayout;
++
++	/* Can't merge, so let's initialize this new layout */
++	init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg, &lgp->lg_sid);
++out_unlock:
++	if (ls)
++		put_layout_state(ls);
++	if (fp)
++		put_nfs4_file(fp);
++	nfs4_unlock_state();
++out:
++	dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp,
++		be32_to_cpu(nfserr));
++	return nfserr;
++out_freelayout:
++	free_layout(lp);
++	goto out_unlock;
++}
++
++static void
++trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr)
++{
++	u64 lo_start = lo->offset;
++	u64 lo_end = end_offset(lo_start, lo->length);
++	u64 lr_start = lr->offset;
++	u64 lr_end = end_offset(lr_start, lr->length);
++
++	dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__,
++		lo->offset, lo->length, lr->offset, lr->length);
++
++	/* lr fully covers lo? */
++	if (lr_start <= lo_start && lo_end <= lr_end) {
++		lo->length = 0;
++		goto out;
++	}
++
++	/*
++	 * split not supported yet. retain layout segment.
++ * remains must be returned by the client ++ * on the final layout return. ++ */ ++ if (lo_start < lr_start && lr_end < lo_end) { ++ dprintk("%s: split not supported\n", __func__); ++ goto out; ++ } ++ ++ if (lo_start < lr_start) ++ lo_end = lr_start - 1; ++ else /* lr_end < lo_end */ ++ lo_start = lr_end + 1; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start; ++out: ++ dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length); ++} ++ ++static int ++pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_layout_state *ls) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ dprintk("%s: clp %p fp %p\n", __func__, clp, fp); ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) { ++ dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n", ++ __func__, lp, ++ lp->lo_client, clp, ++ lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type, ++ lp->lo_seg.iomode, lrp->args.lr_seg.iomode); ++ if (lp->lo_client != clp || ++ lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type || ++ (lp->lo_seg.iomode != lrp->args.lr_seg.iomode && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) || ++ !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg)) ++ continue; ++ layouts_found++; ++ trim_layout(&lp->lo_seg, &lrp->args.lr_seg); ++ if (!lp->lo_seg.length) { ++ lrp->lrs_present = 0; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ } ++ if (ls && layouts_found && lrp->lrs_present) ++ update_layout_stateid(ls, &lrp->lr_sid); ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++pnfs_return_client_layouts(struct nfs4_client *clp, ++ struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) { ++ if 
(lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type || ++ (lrp->args.lr_seg.iomode != lp->lo_seg.iomode && ++ lrp->args.lr_seg.iomode != IOMODE_ANY)) ++ continue; ++ ++ if (lrp->args.lr_return_type == RETURN_FSID && ++ !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid)) ++ continue; ++ ++ layouts_found++; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++recall_return_perfect_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode || ++ clr->cb.cbl_recall_type != lrp->args.lr_return_type) ++ return 0; ++ ++ return (clr->cb.cbl_recall_type == RETURN_FILE && ++ clr->clr_file == fp && ++ clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset && ++ clr->cb.cbl_seg.length == lrp->args.lr_seg.length) || ++ ++ (clr->cb.cbl_recall_type == RETURN_FSID && ++ same_fsid(&clr->cb.cbl_fsid, current_fh)) || ++ ++ clr->cb.cbl_recall_type == RETURN_ALL; ++} ++ ++static int ++recall_return_partial_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ /* iomode matching? */ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode && ++ clr->cb.cbl_seg.iomode != IOMODE_ANY && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) ++ return 0; ++ ++ if (clr->cb.cbl_recall_type == RETURN_ALL || ++ lrp->args.lr_return_type == RETURN_ALL) ++ return 1; ++ ++ /* fsid matches? */ ++ if (clr->cb.cbl_recall_type == RETURN_FSID || ++ lrp->args.lr_return_type == RETURN_FSID) ++ return same_fsid(&clr->cb.cbl_fsid, current_fh); ++ ++ /* file matches, range overlapping? 
*/ ++ return clr->clr_file == fp && ++ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg); ++} ++ ++int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status = 0; ++ int layouts_found = 0; ++ struct inode *ino = current_fh->fh_dentry->d_inode; ++ struct nfs4_file *fp = NULL; ++ struct nfs4_client *clp; ++ struct nfs4_layout_state *ls = NULL; ++ struct nfs4_layoutrecall *clr, *nextclr; ++ u64 ex_fsid = current_fh->fh_export->ex_fsid; ++ void *recall_cookie = NULL; ++ ++ dprintk("NFSD: %s\n", __func__); ++ ++ nfs4_lock_state(); ++ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid); ++ if (!clp) ++ goto out; ++ ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ fp = find_file(ino); ++ if (!fp) { ++ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for " ++ "ino %p:%lu\n", ++ __func__, ino, ino ? ino->i_ino : 0L); ++ goto out; ++ } ++ ++ /* Check the stateid */ ++ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino); ++ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, &ls); ++ if (status) ++ goto out_put_file; ++ ++ /* update layouts */ ++ layouts_found = pnfs_return_file_layouts(clp, fp, lrp, ls); ++ /* optimize for the all-empty case */ ++ if (list_empty(&fp->fi_layouts)) ++ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ } else { ++ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid); ++ } ++ ++ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d " ++ "return_type %d fsid 0x%llx offset %llu length %llu: " ++ "layouts_found %d\n", ++ __func__, clp, fp, lrp->args.lr_seg.layout_type, ++ lrp->args.lr_seg.iomode, lrp->args.lr_return_type, ++ ex_fsid, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found); ++ ++ /* update layoutrecalls ++ * note: for RETURN_{FSID,ALL}, fp may be NULL ++ */ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls, ++ clr_perclnt) { ++ if 
(clr->cb.cbl_seg.layout_type != lrp->args.lr_seg.layout_type) ++ continue; ++ ++ if (recall_return_perfect_match(clr, lrp, fp, current_fh)) ++ recall_cookie = layoutrecall_done(clr); ++ else if (layouts_found && ++ recall_return_partial_match(clr, lrp, fp, current_fh)) ++ clr->clr_time = CURRENT_TIME; ++ } ++ spin_unlock(&layout_lock); ++ ++out_put_file: ++ if (fp) ++ put_nfs4_file(fp); ++ if (ls) ++ put_layout_state(ls); ++out: ++ nfs4_unlock_state(); ++ ++ /* call exported filesystem layout_return (ignore return-code) */ ++ fs_layout_return(sb, ino, lrp, 0, recall_cookie); ++ ++ dprintk("pNFS %s: exit status %d \n", __func__, status); ++ return status; ++} ++ ++/* ++ * PNFS Metadata server export operations callback for get_state ++ * ++ * called by the cluster fs when it receives a get_state() from a data ++ * server. ++ * returns status, or pnfs_get_state* with pnfs_get_state->status set. ++ * ++ */ ++int ++nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg) ++{ ++ struct nfs4_stateid *stp; ++ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */ ++ int status = -EINVAL; ++ struct inode *ino; ++ struct nfs4_delegation *dl; ++ stateid_t *stid = (stateid_t *)&arg->stid; ++ ++ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__, ++ STATEID_VAL(stid), arg->ino); ++ ++ nfs4_lock_state(); ++ stp = find_stateid(stid, flags); ++ if (!stp) { ++ ino = iget_locked(sb, arg->ino); ++ if (!ino) ++ goto out; ++ ++ if (ino->i_state & I_NEW) { ++ iget_failed(ino); ++ goto out; ++ } ++ ++ dl = find_delegation_stateid(ino, stid); ++ if (dl) ++ status = 0; ++ ++ iput(ino); ++ } else { ++ /* XXX ANDROS: marc removed nfs4_check_fh - how come? 
*/ ++ ++ /* arg->devid is the Data server id, set by the cluster fs */ ++ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid); ++ if (status) ++ goto out; ++ ++ arg->access = stp->st_access_bmap; ++ *(clientid_t *)&arg->clid = ++ stp->st_stateowner->so_client->cl_clientid; ++ } ++out: ++ nfs4_unlock_state(); ++ return status; ++} ++ ++static int ++cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile, ++ stateid_t *lsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ struct nfs4_layout_state *ls; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) { ++ if (lp->lo_file != lrfile) ++ continue; ++ ++ ls = find_get_layout_state(clp, lrfile); ++ if (!ls) { ++ /* This shouldn't happen as the file should have a ++ * layout stateid if it has a layout. ++ */ ++ printk(KERN_ERR "%s: file %p has no layout stateid\n", ++ __func__, lrfile); ++ WARN_ON(1); ++ break; ++ } ++ update_layout_stateid(ls, lsid); ++ put_layout_state_locked(ls); ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ return found; ++} ++ ++static int ++cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ ++ /* note: minor version unused */ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) ++ if (lp->lo_file->fi_fsid.major == fsid->major) { ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return found; ++} ++ ++static int ++cl_has_any_layout(struct nfs4_client *clp) ++{ ++ return !list_empty(&clp->cl_layouts); ++} ++ ++static int ++cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile, stateid_t *lsid) ++{ ++ switch (cbl->cbl_recall_type) { ++ case RETURN_FILE: ++ return cl_has_file_layout(clp, lrfile, lsid); ++ case RETURN_FSID: ++ return cl_has_fsid_layout(clp, &cbl->cbl_fsid); ++ default: ++ return cl_has_any_layout(clp); ++ } ++} ++ ++/* ++ * Called without the layout_lock. 
++ */
++void
++nomatching_layout(struct nfs4_layoutrecall *clr)
++{
++	/*
++	 * Simulate the layout return the client never sent for recall @clr:
++	 * build a return from the recall's own type and segment, drop the
++	 * matching layouts, complete the recall and notify the file system
++	 * with LR_FLAG_INTERN.
++	 */
++	struct nfsd4_pnfs_layoutreturn lr = {
++		.args.lr_return_type = clr->cb.cbl_recall_type,
++		.args.lr_seg = clr->cb.cbl_seg,
++	};
++	struct inode *inode;
++	void *recall_cookie;
++
++	/* pin the inode (when the recall has a file) until fs_layout_return */
++	if (clr->clr_file) {
++		inode = igrab(clr->clr_file->fi_inode);
++		if (WARN_ON(!inode))
++			return;
++	} else {
++		inode = NULL;
++	}
++
++	dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__,
++		clr->clr_client, clr->clr_file);
++
++	if (clr->cb.cbl_recall_type == RETURN_FILE)
++		pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr,
++					 NULL);
++	else
++		pnfs_return_client_layouts(clr->clr_client, &lr,
++					   clr->cb.cbl_fsid.major);
++
++	spin_lock(&layout_lock);
++	recall_cookie = layoutrecall_done(clr);
++	spin_unlock(&layout_lock);
++
++	fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN,
++			 recall_cookie);
++	iput(inode);
++}
++
++/*
++ * Tear down all pNFS layout state of an expiring client @clp.
++ * First loop: complete every outstanding layout recall as if the client
++ * had returned nothing (nomatching_layout).  Second loop: destroy every
++ * remaining layout and report each one to the file system with
++ * LR_FLAG_EXPIRE.  layout_lock is only held while picking and unlinking
++ * an entry, never across the file system callback.
++ */
++void pnfs_expire_client(struct nfs4_client *clp)
++{
++	for (;;) {
++		struct nfs4_layoutrecall *lrp = NULL;
++
++		spin_lock(&layout_lock);
++		if (!list_empty(&clp->cl_layoutrecalls)) {
++			lrp = list_entry(clp->cl_layoutrecalls.next,
++					 struct nfs4_layoutrecall, clr_perclnt);
++			/* hold a reference across the unlocked section */
++			get_layoutrecall(lrp);
++		}
++		spin_unlock(&layout_lock);
++		if (!lrp)
++			break;
++
++		dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file);
++		BUG_ON(lrp->clr_client != clp);
++		nomatching_layout(lrp);
++		put_layoutrecall(lrp);
++	}
++
++	for (;;) {
++		struct nfs4_layout *lp = NULL;
++		struct inode *inode = NULL;
++		struct nfsd4_pnfs_layoutreturn lr;
++		bool empty = false;
++
++		spin_lock(&layout_lock);
++		if (!list_empty(&clp->cl_layouts)) {
++			lp = list_entry(clp->cl_layouts.next,
++					struct nfs4_layout, lo_perclnt);
++			inode = igrab(lp->lo_file->fi_inode);
++			memset(&lr, 0, sizeof(lr));
++			lr.args.lr_return_type = RETURN_FILE;
++			lr.args.lr_seg = lp->lo_seg;
++			/*
++			 * NOTE(review): "empty" is sampled before
++			 * dequeue_layout(); if lp is still linked on
++			 * fi_layouts at this point it can never be true.
++			 * Confirm against dequeue_layout().
++			 */
++			empty = list_empty(&lp->lo_file->fi_layouts);
++			BUG_ON(lp->lo_client != clp);
++			dequeue_layout(lp);
++			destroy_layout(lp); /* do not access lp after this */
++		}
++		spin_unlock(&layout_lock);
++		if (!lp)
++			break;
++
++		/* the layout is already gone here; only the fs callback is skipped */
++		if (WARN_ON(!inode))
++			break;
++
++		dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino,
++			lp, clp);
++
++		fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE,
++				 empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL);
++		iput(inode);
++	}
++}
++
++/* Argument bundle handed to lo_recall_per_client() for each client. */
++struct create_recall_list_arg {
++	struct nfsd4_pnfs_cb_layout *cbl;	/* recall request being fanned out */
++	struct nfs4_file *lrfile;	/* file being recalled; callers may pass NULL */
++	struct list_head *todolist;	/* pending recalls are added here */
++	unsigned todo_count;	/* number of entries added to todolist */
++};
++
++/*
++ * look for matching layout for the given client
++ * and add a pending layout recall to the todo list
++ * if found any.
++ * returns:
++ *   0 if layouts found or negative error.
++ */
++static int
++lo_recall_per_client(struct nfs4_client *clp, void *p)
++{
++	stateid_t lsid;
++	struct nfs4_layoutrecall *pending;
++	struct create_recall_list_arg *arg = p;
++
++	memset(&lsid, 0, sizeof(lsid));
++	/* a client without a matching layout is skipped and still returns 0 */
++	if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid))
++		return 0;
++
++	/* Matching put done by layoutreturn */
++	pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile);
++	/* out of memory, drain todo queue */
++	if (!pending)
++		return -ENOMEM;
++
++	*(stateid_t *)&pending->cb.cbl_sid = lsid;
++	list_add(&pending->clr_perclnt, arg->todolist);
++	arg->todo_count++;
++	return 0;
++}
++
++/* Create a layoutrecall structure for each client based on the
++ * original structure.
*/
++int
++create_layout_recall_list(struct list_head *todolist, unsigned *todo_len,
++			  struct nfsd4_pnfs_cb_layout *cbl,
++			  struct nfs4_file *lrfile)
++{
++	/*
++	 * Fills @todolist with one pending recall per client holding a
++	 * matching layout and stores the number queued in @todo_len (also on
++	 * error).  Returns 0, -ENOENT when the client named in @cbl is not
++	 * found, or the error from the per-client callback.
++	 */
++	struct nfs4_client *clp;
++	struct create_recall_list_arg arg = {
++		.cbl = cbl,
++		.lrfile = lrfile,
++		.todolist = todolist,
++	};
++	int status = 0;
++
++	dprintk("%s: -->\n", __func__);
++
++	/* If client given by fs, just do single client */
++	if (cbl->cbl_seg.clientid) {
++		clp = find_confirmed_client(
++				(clientid_t *)&cbl->cbl_seg.clientid);
++		if (!clp) {
++			status = -ENOENT;
++			dprintk("%s: clientid %llx not found\n", __func__,
++				(unsigned long long)cbl->cbl_seg.clientid);
++			goto out;
++		}
++
++		status = lo_recall_per_client(clp, &arg);
++	} else {
++		/* Check all clients for layout matches */
++		status = filter_confirmed_clients(lo_recall_per_client, &arg);
++	}
++
++out:
++	*todo_len = arg.todo_count;
++	dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status);
++	return status;
++}
++
++/*
++ * Recall layouts asynchronously
++ * Called with state lock.
++ *
++ * Drains @todolist: each pending recall is time-stamped, linked onto its
++ * client's cl_layoutrecalls list and sent with nfsd4_cb_layout().  When
++ * more than one recall is queued a parent recall is allocated and every
++ * child points at it.  Returns 0, or -ENOMEM when the parent could not be
++ * allocated (one queued recall is then not sent).
++ */
++static int
++spawn_layout_recall(struct super_block *sb, struct list_head *todolist,
++		    unsigned todo_len)
++{
++	struct nfs4_layoutrecall *pending;
++	struct nfs4_layoutrecall *parent = NULL;
++	int status = 0;
++
++	dprintk("%s: -->\n", __func__);
++
++	if (todo_len > 1) {
++		pending = list_entry(todolist->next, struct nfs4_layoutrecall,
++				     clr_perclnt);
++
++		parent = alloc_init_layoutrecall(&pending->cb, NULL,
++						 pending->clr_file);
++		if (unlikely(!parent)) {
++			/* We want forward progress.  If parent cannot be
++			 * allocated take the first one as parent but don't
++			 * execute it.  Caller must check for -EAGAIN, if so
++			 * When the partial recalls return,
++			 * nfsd_layout_recall_cb should be called again.
++			 */
++			list_del_init(&pending->clr_perclnt);
++			if (todo_len > 2) {
++				parent = pending;
++			} else {
++				/* only one other recall left: no parent needed */
++				parent = NULL;
++				put_layoutrecall(pending);
++			}
++			--todo_len;
++			status = -ENOMEM;
++		}
++	}
++
++	while (!list_empty(todolist)) {
++		pending = list_entry(todolist->next, struct nfs4_layoutrecall,
++				     clr_perclnt);
++		list_del_init(&pending->clr_perclnt);
++		dprintk("%s: clp %p cb_client %p fp %p\n", __func__,
++			pending->clr_client,
++			pending->clr_client->cl_cb_client,
++			pending->clr_file);
++		if (unlikely(!pending->clr_client->cl_cb_client)) {
++			printk(KERN_INFO
++				"%s: clientid %08x/%08x has no callback path\n",
++				__func__,
++				pending->clr_client->cl_clientid.cl_boot,
++				pending->clr_client->cl_clientid.cl_id);
++			put_layoutrecall(pending);
++			/* NOTE(review): todo_len is not decremented on this path */
++			continue;
++		}
++
++		pending->clr_time = CURRENT_TIME;
++		pending->clr_sb = sb;
++		if (parent) {
++			/* If we created a parent its initial ref count is 1.
++			 * We will need to de-ref it eventually.  So we just
++			 * don't increment on behalf of the last one.
++			 */
++			if (todo_len != 1)
++				get_layoutrecall(parent);
++		}
++		pending->parent = parent;
++		get_layoutrecall(pending);
++		/* Add to list so corresponding layoutreturn can find req */
++		list_add(&pending->clr_perclnt,
++			 &pending->clr_client->cl_layoutrecalls);
++
++		nfsd4_cb_layout(pending);
++		--todo_len;
++	}
++
++	return status;
++}
++
++/*
++ * Spawn a thread to perform a recall layout
++ *
++ * NOTE(review): no thread is created in this function itself; it builds
++ * the per-client recall list under the state lock and passes it to
++ * spawn_layout_recall().  Returns 0, -ENOENT (nfsd not running, no
++ * nfs4_file for @inode, or nothing to recall), or -EAGAIN when recalls
++ * were queued but an error occurred.
++ */
++int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode,
++			  struct nfsd4_pnfs_cb_layout *cbl)
++{
++	int status;
++	struct nfs4_file *lrfile = NULL;
++	struct list_head todolist;
++	unsigned todo_len = 0;
++
++	dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl);
++	BUG_ON(!cbl);
++	BUG_ON(cbl->cbl_recall_type != RETURN_FILE &&
++	       cbl->cbl_recall_type != RETURN_FSID &&
++	       cbl->cbl_recall_type != RETURN_ALL);
++	BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode);
++	BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ &&
++	       cbl->cbl_seg.iomode != IOMODE_RW &&
++	       cbl->cbl_seg.iomode != IOMODE_ANY);
++
++	if (nfsd_serv == NULL) {
++		dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n");
++		return -ENOENT;
++	}
++
++	nfs4_lock_state();
++	status = -ENOENT;
++	if (inode) {
++		lrfile = find_file(inode);
++		if (!lrfile) {
++			dprintk("NFSD nfsd_layout_recall_cb: "
++				"nfs4_file not found\n");
++			goto err;
++		}
++		/* an fsid recall takes its fsid from the given file */
++		if (cbl->cbl_recall_type == RETURN_FSID)
++			cbl->cbl_fsid = lrfile->fi_fsid;
++	}
++
++	INIT_LIST_HEAD(&todolist);
++
++	/* If no cookie provided by FS, return a default one */
++	if (!cbl->cbl_cookie)
++		cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
++
++	status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile);
++	if (list_empty(&todolist)) {
++		status = -ENOENT;
++	} else {
++		/* process todolist even if create_layout_recall_list
++		 * returned an error */
++		int status2 = spawn_layout_recall(sb, &todolist, todo_len);
++		if (status2)
++			status = status2;
++	}
++
++err:
++	nfs4_unlock_state();
++	if (lrfile)
++		put_nfs4_file(lrfile);
++	/* any error with recalls already queued is reported as -EAGAIN */
++	return (todo_len && status) ? -EAGAIN : status;
++}
++
++/* Argument bundle handed to create_device_notify_per_cl() for each client. */
++struct create_device_notify_list_arg {
++	struct list_head *todolist;	/* notifications are added here */
++	struct nfsd4_pnfs_cb_dev_list *ndl;	/* device list to notify about */
++};
++
++/*
++ * Per-client callback: queue a device notification for @clp on the todo
++ * list unless its cl_deviceref count is <= 0.  Returns 0 or -ENOMEM.
++ */
++static int
++create_device_notify_per_cl(struct nfs4_client *clp, void *p)
++{
++	struct nfs4_notify_device *cbnd;
++	struct create_device_notify_list_arg *arg = p;
++
++	if (atomic_read(&clp->cl_deviceref) <= 0)
++		return 0;
++
++	cbnd = kzalloc(sizeof(*cbnd), GFP_KERNEL);
++	if (!cbnd)
++		return -ENOMEM;
++
++	cbnd->nd_list = arg->ndl;
++	cbnd->nd_client = clp;
++	INIT_WORK(&cbnd->nd_recall.cb_work, nfsd4_do_callback_rpc);
++	list_add(&cbnd->nd_perclnt, arg->todolist);
++	return 0;
++}
++
++/* Create a list of clients to send device notifications. */
++int
++create_device_notify_list(struct list_head *todolist,
++			  struct nfsd4_pnfs_cb_dev_list *ndl)
++{
++	int status;
++	struct create_device_notify_list_arg arg = {
++		.todolist = todolist,
++		.ndl = ndl,
++	};
++
++	/* the confirmed-client walk is done under the state lock */
++	nfs4_lock_state();
++	status = filter_confirmed_clients(create_device_notify_per_cl, &arg);
++	nfs4_unlock_state();
++
++	return status;
++}
++
++/*
++ * For each client that has a device, send a device notification.
++ * XXX: Need to track which clients have which devices.
++ */
++int nfsd_device_notify_cb(struct super_block *sb,
++			  struct nfsd4_pnfs_cb_dev_list *ndl)
++{
++	LIST_HEAD(pending);
++	unsigned int sent = 0;
++	int err;
++
++	BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list);
++
++	dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len);
++
++	if (!nfsd_serv)
++		return -ENOENT;
++
++	err = create_device_notify_list(&pending, ndl);
++
++	/* send whatever got queued, even when list creation reported an error */
++	for (; !list_empty(&pending); sent++) {
++		struct nfs4_notify_device *note =
++			list_entry(pending.next, struct nfs4_notify_device,
++				   nd_perclnt);
++		/* fetch the client up front; it is used after the notify call */
++		struct nfs4_client *clp = note->nd_client;
++
++		list_del_init(&note->nd_perclnt);
++		nfsd4_cb_notify_device(note);
++		pnfs_clear_device_notify(clp);
++	}
++
++	dprintk("NFSD %s: status %d clients %u\n",
++		__func__, err, sent);
++	return err;
++}
+diff -up linux-2.6.38.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4pnfsdlm.c
+--- linux-2.6.38.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2011-03-26 07:57:44.287821200 -0400
++++ linux-2.6.38.noarch/fs/nfsd/nfs4pnfsdlm.c 2011-03-26 07:57:44.287821200 -0400
+@@ -0,0 +1,461 @@
++/******************************************************************************
++ *
++ * (c) 2007 Network Appliance, Inc. All Rights Reserved.
++ * (c) 2009 NetApp. All Rights Reserved.
++ *
++ * NetApp provides this source code under the GPL v2 License.
++ * The GPL v2 license is available at
++ * http://opensource.org/licenses/gpl-license.php.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ ******************************************************************************/
++
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "nfsfh.h"
++#include "nfsd.h"
++
++#define NFSDDBG_FACILITY NFSDDBG_PROC
++
++/* Just use a linked list. Do not expect more than 32 dlm_device_entries
++ * the first implementation will just use one device per cluster file system
++ */
++
++static LIST_HEAD(dlm_device_list);
++static DEFINE_SPINLOCK(dlm_device_list_lock);
++
++/* One exported DLM cluster file system and its data server list. */
++struct dlm_device_entry {
++	struct list_head dlm_dev_list;	/* link on dlm_device_list */
++	char disk_name[DISK_NAME_LEN];	/* NUL-terminated device name */
++	int num_ds;	/* number of addresses in ds_list */
++	char ds_list[NFSD_DLM_DS_LIST_MAX];	/* comma separated, NUL-terminated */
++};
++
++/*
++ * Find the entry whose disk_name equals @disk_name, or NULL.
++ * The list is walked under dlm_device_list_lock; the returned entry is
++ * used by callers after the lock has been dropped.
++ */
++static struct dlm_device_entry *
++_nfsd4_find_pnfs_dlm_device(char *disk_name)
++{
++	struct dlm_device_entry *dlm_pdev;
++
++	dprintk("--> %s  disk name %s\n", __func__, disk_name);
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) {
++		dprintk("%s Look for dlm_pdev %s\n", __func__,
++			dlm_pdev->disk_name);
++		/*
++		 * Whole-string compare: comparing only strlen(disk_name)
++		 * bytes let a lookup name that is a prefix of a stored
++		 * name match the wrong entry.
++		 */
++		if (!strcmp(dlm_pdev->disk_name, disk_name)) {
++			spin_unlock(&dlm_device_list_lock);
++			return dlm_pdev;
++		}
++	}
++	spin_unlock(&dlm_device_list_lock);
++	return NULL;
++}
++
++/* Look up the entry for the block device backing super block @sb. */
++static struct dlm_device_entry *
++nfsd4_find_pnfs_dlm_device(struct super_block *sb) {
++	char dname[BDEVNAME_SIZE];
++
++	bdevname(sb->s_bdev, dname);
++	return _nfsd4_find_pnfs_dlm_device(dname);
++}
++
++/*
++ * Format every entry as "disk_name:ds_list\n" into @buf.
++ * Returns the number of bytes written, or -EINVAL when @buflen is too
++ * small to hold the whole list.
++ */
++ssize_t
++nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen)
++{
++	char *pos = buf;
++	ssize_t size = 0;
++	struct dlm_device_entry *dlm_pdev;
++	int ret = -EINVAL;
++
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list)
++	{
++		int advanced;
++		advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list);
++		/* snprintf reports the untruncated length: >= space left means overflow */
++		if (advanced >= buflen - size)
++			goto out;
++		size += advanced;
++		pos += advanced;
++	}
++	ret = size;
++
++out:
++	spin_unlock(&dlm_device_list_lock);
++	return ret;
++}
++
++/*
++ * Check that @ds_list is a comma separated list of addresses accepted by
++ * rpc_pton() and count them into @num_ds.  Returns false on the first
++ * address that does not parse.
++ */
++bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds)
++{
++	char *start = ds_list;
++
++	*num_ds = 0;
++
++	while (*start) {
++		struct sockaddr_storage tempAddr;
++		int ipLen = strcspn(start, ",");
++
++		if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr)))
++			return false;
++		(*num_ds)++;
++		start += ipLen;
++		/*
++		 * Skip the separator only when there is one: after the last
++		 * address start sits on the terminating NUL, and advancing
++		 * by ipLen + 1 unconditionally walked past the string.
++		 */
++		if (*start == ',')
++			start++;
++	}
++	return true;
++}
++
++/*
++ * pnfs_dlm_device string format:
++ *     block-device-path:<comma separated data server addresses>
++ *
++ * Examples
++ *     /dev/sda:192.168.1.96,192.168.1.97' creates a data server list with
++ *     two data servers for the dlm cluster file system mounted on /dev/sda.
++ *
++ *     /dev/sda:192.168.1.96,192.168.1.100'
++ *     replaces the data server list for /dev/sda
++ *
++ *     Only the deviceid == 1 is supported. Can add device id to
++ *     pnfs_dlm_device string when needed.
++ *
++ *     Only the round robin each data server once stripe index is supported.
++ *
++ * Returns 0 on success, -ENOMEM, or -EINVAL for a malformed string, a
++ * name or list too long to store with its terminating NUL, or an address
++ * that does not validate.  The incoming @len is not used for parsing.
++ */
++int
++nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len)
++
++{
++	struct dlm_device_entry *new, *found;
++	char *bufp = pnfs_dlm_device;
++	char *endp = bufp + strlen(bufp);
++	int err = -ENOMEM;
++
++	dprintk("--> %s len %d\n", __func__, len);
++
++	new = kzalloc(sizeof(*new), GFP_KERNEL);
++	if (!new)
++		return err;
++
++	err = -EINVAL;
++	/* disk_name */
++	/* FIXME: need to check for valid disk_name. search superblocks?
++	 * check for slash dev slash ?
++	 */
++	len = strcspn(bufp, ":");
++	/* ">=" keeps one zeroed byte as terminator (entry is kzalloc'ed) */
++	if (len >= DISK_NAME_LEN)
++		goto out_free;
++	memcpy(new->disk_name, bufp, len);
++
++	err = -EINVAL;
++	bufp += len + 1;
++	if (bufp >= endp)
++		goto out_free;
++
++	/* data server list */
++	/* FIXME: need to check for comma separated valid ip format */
++	len = strcspn(bufp, ":");
++	/* ">=" for the same reason: ds_list is parsed as a C string below */
++	if (len >= NFSD_DLM_DS_LIST_MAX)
++		goto out_free;
++	memcpy(new->ds_list, bufp, len);
++
++
++	/* validate the ips */
++	if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds)))
++		goto out_free;
++
++	dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__,
++		new->disk_name, new->num_ds, new->ds_list);
++
++	found = _nfsd4_find_pnfs_dlm_device(new->disk_name);
++	if (found) {
++		/* FIXME: should compare found->ds_list with new->ds_list
++		 * and if it is different, kick off a CB_NOTIFY change
++		 * deviceid.
++		 */
++		dprintk("%s pnfs_dlm_device %s:%s already in cache "
++			" replace ds_list with new ds_list %s\n", __func__,
++			found->disk_name, found->ds_list, new->ds_list);
++		/*
++		 * Clear the whole ds_list array; clearing only
++		 * DISK_NAME_LEN bytes left the tail of a longer old list
++		 * behind a shorter new one.
++		 */
++		memset(found->ds_list, 0, sizeof(found->ds_list));
++		memcpy(found->ds_list, new->ds_list, strlen(new->ds_list));
++		found->num_ds = new->num_ds;
++		kfree(new);
++	} else {
++		dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__,
++			new->disk_name, new->ds_list);
++		spin_lock(&dlm_device_list_lock);
++		list_add(&new->dlm_dev_list, &dlm_device_list);
++		spin_unlock(&dlm_device_list_lock);
++	}
++	dprintk("<-- %s Success\n", __func__);
++	return 0;
++
++out_free:
++	kfree(new);
++	dprintk("<-- %s returns %d\n", __func__, err);
++	return err;
++}
++
++/* Unlink and free every entry on dlm_device_list. */
++void nfsd4_pnfs_dlm_shutdown(void)
++{
++	struct dlm_device_entry *dlm_pdev, *next;
++
++	dprintk("--> %s\n", __func__);
++
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list,
++				  dlm_dev_list) {
++		list_del(&dlm_pdev->dlm_dev_list);
++		kfree(dlm_pdev);
++	}
++	spin_unlock(&dlm_device_list_lock);
++}
++
++/*
++ * Device iterator: reports the single device id 1 on the first call
++ * (gd_cookie == 0) and -ENOENT on any later call.  Only the file layout
++ * type is accepted (-ENOTSUPP otherwise).
++ */
++static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb,
++				     u32 layout_type,
++				     struct nfsd4_pnfs_dev_iter_res *res)
++{
++	if (layout_type != LAYOUT_NFSV4_1_FILES) {
++		printk(KERN_ERR "%s: ERROR: layout type isn't 'file' "
++			"(type: %x)\n", __func__, layout_type);
++		return -ENOTSUPP;
++	}
++
++	res->gd_eof = 1;
++	if (res->gd_cookie)
++		return -ENOENT;
++
++	res->gd_cookie = 1;
++	res->gd_verf = 1;
++	res->gd_devid = 1;
++	return 0;
++}
++
++/*
++ * GETDEVICEINFO for a DLM exported file system: build a file layout
++ * device description (one single-path "tcp" address per configured data
++ * server plus an identity stripe index list) and have nfsd encode it into
++ * @xdr.  Returns the encoder's result, -ENOTSUPP for a non-file layout
++ * type, -EINVAL for a device id other than 1 or an unconfigured disk, or
++ * -ENOMEM.
++ */
++static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb,
++				     struct exp_xdr_stream *xdr,
++				     u32 layout_type,
++				     const struct nfsd4_pnfs_deviceid *devid)
++{
++	int err, len, i = 0;
++	struct pnfs_filelayout_device fdev;
++	struct pnfs_filelayout_devaddr *daddr;
++	struct dlm_device_entry *dlm_pdev;
++	char *bufp;
++
++	err = -ENOTSUPP;
++	if (layout_type != LAYOUT_NFSV4_1_FILES) {
++		dprintk("%s: ERROR: layout type isn't 'file' "
++			"(type: %x)\n", __func__, layout_type);
++		return err;
++	}
++
++	/* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO
++	 * with a gdia_device_id != 1 is invalid.
++	 */
++	err = -EINVAL;
++	if (devid->devid != 1) {
++		dprintk("%s: WARNING: didn't receive a deviceid of "
++			"1 (got: 0x%llx)\n", __func__, devid->devid);
++		return err;
++	}
++
++	/*
++	 * If the DS list has not been established, return -EINVAL
++	 */
++	dlm_pdev = nfsd4_find_pnfs_dlm_device(sb);
++	if (!dlm_pdev) {
++		dprintk("%s: DEBUG: disk %s Not Found\n", __func__,
++			sb->s_bdev->bd_disk->disk_name);
++		return err;
++	}
++
++	dprintk("%s: Found disk %s with DS list |%s|\n",
++		__func__, dlm_pdev->disk_name, dlm_pdev->ds_list);
++
++	memset(&fdev, '\0', sizeof(fdev));
++	fdev.fl_device_length = dlm_pdev->num_ds;
++
++	err = -ENOMEM;
++	len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length;
++	/* zeroed, so unfilled fl_multipath_list slots stay NULL for cleanup */
++	fdev.fl_device_list = kzalloc(len, GFP_KERNEL);
++	if (!fdev.fl_device_list) {
++		/* report the real DS count; "i" is still 0 at this point */
++		printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list "
++			"buffer for %d DSes.\n", __func__, dlm_pdev->num_ds);
++		fdev.fl_device_length = 0;
++		goto out;
++	}
++
++	/* Set a simple stripe index list */
++	fdev.fl_stripeindices_length = fdev.fl_device_length;
++	fdev.fl_stripeindices_list = kzalloc(sizeof(u32) *
++				fdev.fl_stripeindices_length, GFP_KERNEL);
++
++	if (!fdev.fl_stripeindices_list) {
++		printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices "
++			"list buffer for %d DSes.\n", __func__,
++			dlm_pdev->num_ds);
++		goto out;
++	}
++	for (i = 0; i < fdev.fl_stripeindices_length; i++)
++		fdev.fl_stripeindices_list[i] = i;
++
++	/* Transfer the data server list with a single multipath entry */
++	bufp = dlm_pdev->ds_list;
++	for (i = 0; i < fdev.fl_device_length; i++) {
++		daddr = kmalloc(sizeof(*daddr), GFP_KERNEL);
++		if (!daddr) {
++			printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
++				"addr buffer.\n", __func__);
++			goto out;
++		}
++
++		daddr->r_netid.data = "tcp";
++		daddr->r_netid.len = 3;
++
++		len = strcspn(bufp, ",");
++		daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL);
++		if (!daddr->r_addr.data) {
++			/* not linked into fdev yet: free it here, not at out */
++			printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
++				"addr string buffer.\n", __func__);
++			kfree(daddr);
++			goto out;
++		}
++		memcpy(daddr->r_addr.data, bufp, len);
++		/*
++		 * append the port number.  interpreted as two more bytes
++		 * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049.
++		 */
++		memcpy(daddr->r_addr.data + len, ".8.1", 4);
++		daddr->r_addr.len = len + 4;
++
++		fdev.fl_device_list[i].fl_multipath_length = 1;
++		fdev.fl_device_list[i].fl_multipath_list = daddr;
++
++		dprintk("%s: encoding DS |%s|\n", __func__, bufp);
++
++		bufp += len + 1;
++	}
++
++	/* have nfsd encode the device info */
++	err = filelayout_encode_devinfo(xdr, &fdev);
++out:
++	for (i = 0; i < fdev.fl_device_length; i++) {
++		daddr = fdev.fl_device_list[i].fl_multipath_list;
++		/* every linked daddr owns its r_addr.data buffer: free both */
++		if (daddr)
++			kfree(daddr->r_addr.data);
++		kfree(daddr);
++	}
++	kfree(fdev.fl_device_list);
++	kfree(fdev.fl_stripeindices_list);
++	dprintk("<-- %s returns %d\n", __func__, err);
++	return err;
++}
++
++/*
++ * Stripe unit for a file system with the given block size: the block size
++ * itself when it is at least NFSSVC_MAXBLKSIZE, otherwise the largest
++ * multiple of the block size that fits in NFSSVC_MAXBLKSIZE.
++ */
++static int get_stripe_unit(int blocksize)
++{
++	if (blocksize >= NFSSVC_MAXBLKSIZE)
++		return blocksize;
++	return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++}
++
++/*
++ * Look up inode block device in pnfs_dlm_device list.
++ * Hash on the inode->i_ino and number of data servers.
++ */
++static int dlm_ino_hash(struct inode *ino)
++{
++	struct dlm_device_entry *de;
++
++	/* If can't find the inode block device in the pnfs_dlm_device list
++	 * then don't hand out a layout
++	 */
++	de = nfsd4_find_pnfs_dlm_device(ino->i_sb);
++	if (!de || de->num_ds <= 0)
++		return -1;
++	/*
++	 * Modulo rather than "& (num_ds - 1)": the mask only spreads over
++	 * all data servers when num_ds is a power of two (where both forms
++	 * give the same index) and turns into an all-ones mask for 0.
++	 */
++	return ino->i_ino % de->num_ds;
++}
++
++/*
++ * LAYOUTGET for a DLM exported file system.  Hands out a read-only,
++ * whole-file, sparse-striped file layout with a single file handle marked
++ * for data server use, and has nfsd encode it into @xdr.
++ * Returns NFS4ERR_BADIOMODE for a read/write request,
++ * NFS4ERR_LAYOUTUNAVAILABLE when no data server list is configured for
++ * the inode's device, NFS4ERR_LAYOUTTRYLATER on allocation failure (with
++ * the result segment length zeroed), otherwise the encoder's status.
++ */
++static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode,
++			   struct exp_xdr_stream *xdr,
++			   const struct nfsd4_pnfs_layoutget_arg *args,
++			   struct nfsd4_pnfs_layoutget_res *res)
++{
++	struct pnfs_filelayout_layout *layout = NULL;
++	struct knfsd_fh *fhp = NULL;
++	int index;
++	enum nfsstat4 rc = NFS4_OK;
++
++	dprintk("%s: LAYOUT_GET\n", __func__);
++
++	/* DLM exported file systems only support layouts for READ */
++	if (res->lg_seg.iomode == IOMODE_RW)
++		return NFS4ERR_BADIOMODE;
++
++	index = dlm_ino_hash(inode);
++	dprintk("%s first stripe index %d i_ino %lu\n", __func__, index,
++		inode->i_ino);
++	if (index < 0)
++		return NFS4ERR_LAYOUTUNAVAILABLE;
++
++	res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	/* Always give out whole file layouts */
++	res->lg_seg.offset = 0;
++	res->lg_seg.length = NFS4_MAX_UINT64;
++	/* Always give out READ ONLY layouts */
++	res->lg_seg.iomode = IOMODE_READ;
++
++	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
++	if (layout == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;
++		goto error;
++	}
++
++	/* Set file layout response args */
++	layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
++	layout->lg_stripe_type = STRIPE_SPARSE;
++	layout->lg_commit_through_mds = false;
++	layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
++	layout->lg_fh_length = 1;
++	layout->device_id.sbid = args->lg_sbid;
++	layout->device_id.devid = 1;                                /*FSFTEMP*/
++	layout->lg_first_stripe_index = index;                      /*FSFTEMP*/
++	layout->lg_pattern_offset = 0;
++
++	fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
++	if (fhp == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;
++		goto error;
++	}
++
++	/* work on a private copy of the file handle before marking it */
++	memcpy(fhp, args->lg_fh, sizeof(*fhp));
++	pnfs_fh_mark_ds(fhp);
++	layout->lg_fh_list = fhp;
++
++	/* Call nfsd to encode layout */
++	rc = filelayout_encode_layout(xdr, layout);
++exit:
++	/* both pointers may still be NULL here; kfree() is called regardless */
++	kfree(layout);
++	kfree(fhp);
++	return rc;
++
++error:
++	res->lg_seg.length = 0;
++	goto exit;
++}
++
++/* The only layout type offered for DLM exports is the file layout. */
++static int
++nfsd4_pnfs_dlm_layouttype(struct super_block *sb)
++{
++	return LAYOUT_NFSV4_1_FILES;
++}
++
++/* For use by DLM cluster file systems exported by pNFSD */
++const struct pnfs_export_operations pnfs_dlm_export_ops = {
++	.layout_type = nfsd4_pnfs_dlm_layouttype,
++	.get_device_info = nfsd4_pnfs_dlm_getdevinfo,
++	.get_device_iter = nfsd4_pnfs_dlm_getdeviter,
++	.layout_get = nfsd4_pnfs_dlm_layoutget,
++};
++EXPORT_SYMBOL(pnfs_dlm_export_ops);
+diff -up linux-2.6.38.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4pnfsds.c
+--- linux-2.6.38.noarch/fs/nfsd/nfs4pnfsds.c.orig 2011-03-26 07:57:44.288821192 -0400
++++ linux-2.6.38.noarch/fs/nfsd/nfs4pnfsds.c 2011-03-26 07:57:44.288821192 -0400
+@@ -0,0 +1,620 @@
++/*
++* linux/fs/nfsd/nfs4pnfsds.c
++*
++* Copyright (c) 2005 The Regents of the University of Michigan.
++* All rights reserved.
++*
++* Andy Adamson
++*
++* Redistribution and use in source and binary forms, with or without
++* modification, are permitted provided that the following conditions
++* are met:
++*
++* 1. Redistributions of source code must retain the above copyright
++* notice, this list of conditions and the following disclaimer.
++* 2. Redistributions in binary form must reproduce the above copyright
++* notice, this list of conditions and the following disclaimer in the
++* documentation and/or other materials provided with the distribution.
++* 3. Neither the name of the University nor the names of its
++* contributors may be used to endorse or promote products derived
++* from this software without specific prior written permission.
++* ++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++* ++*/ ++#if defined(CONFIG_PNFSD) ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsd.h" ++#include "pnfsd.h" ++#include "state.h" ++ ++/* ++ ******************* ++ * PNFS ++ ******************* ++ */ ++/* ++ * Hash tables for pNFS Data Server state ++ * ++ * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using ++ * this data server (DS). ++ * ++ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained ++ * from any MDS. ++ * ++ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained ++ * from any MDS. 
++ * ++ */ ++/* Hash tables for clientid state */ ++#define CLIENT_HASH_BITS 4 ++#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) ++#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) ++ ++#define clientid_hashval(id) \ ++ ((id) & CLIENT_HASH_MASK) ++ ++/* hash table for pnfs_ds_stateid */ ++#define STATEID_HASH_BITS 10 ++#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) ++#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) ++ ++#define stateid_hashval(owner_id, file_id) \ ++ (((owner_id) + (file_id)) & STATEID_HASH_MASK) ++ ++static struct list_head mds_id_tbl; ++static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE]; ++static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE]; ++ ++static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp); ++static inline void put_ds_mdsid(struct pnfs_mds_id *mdp); ++ ++/* Mutex for data server state. Needs to be separate from ++ * mds state mutex since a node can be both mds and ds */ ++static DEFINE_MUTEX(ds_mutex); ++static struct thread_info *ds_mutex_owner; ++ ++static void ++ds_lock_state(void) ++{ ++ mutex_lock(&ds_mutex); ++ ds_mutex_owner = current_thread_info(); ++} ++ ++static void ++ds_unlock_state(void) ++{ ++ BUG_ON(ds_mutex_owner != current_thread_info()); ++ ds_mutex_owner = NULL; ++ mutex_unlock(&ds_mutex); ++} ++ ++static int ++cmp_clid(const clientid_t *cl1, const clientid_t *cl2) ++{ ++ return (cl1->cl_boot == cl2->cl_boot) && ++ (cl1->cl_id == cl2->cl_id); ++} ++ ++void ++nfs4_pnfs_state_init(void) ++{ ++ int i; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&mds_clid_hashtbl[i]); ++ ++ for (i = 0; i < STATEID_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&ds_stid_hashtbl[i]); ++ ++ INIT_LIST_HEAD(&mds_id_tbl); ++} ++ ++static struct pnfs_mds_id * ++find_pnfs_mds_id(u32 mdsid) ++{ ++ struct pnfs_mds_id *local = NULL; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ list_for_each_entry(local, &mds_id_tbl, di_hash) { ++ if (local->di_mdsid == mdsid) ++ return local; ++ } ++ return NULL; ++} ++ ++static 
struct pnfs_ds_clientid * ++find_pnfs_ds_clientid(const clientid_t *clid) ++{ ++ struct pnfs_ds_clientid *local = NULL; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = clientid_hashval(clid->cl_id); ++ list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) { ++ if (cmp_clid(&local->dc_mdsclid, clid)) ++ return local; ++ } ++ return NULL; ++} ++ ++static struct pnfs_ds_stateid * ++find_pnfs_ds_stateid(stateid_t *stid) ++{ ++ struct pnfs_ds_stateid *local = NULL; ++ u32 st_id = stid->si_stateownerid; ++ u32 f_id = stid->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash) ++ if ((local->ds_stid.si_stateownerid == st_id) && ++ (local->ds_stid.si_fileid == f_id) && ++ (local->ds_stid.si_boot == stid->si_boot)) { ++ stateid_t *sid = &local->ds_stid; ++ dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, local, local->ds_flags, ++ STATEID_VAL(sid)); ++ return local; ++ } ++ return NULL; ++} ++ ++static void ++release_ds_mdsid(struct kref *kref) ++{ ++ struct pnfs_mds_id *mdp = ++ container_of(kref, struct pnfs_mds_id, di_ref); ++ dprintk("pNFSD: %s\n", __func__); ++ ++ list_del(&mdp->di_hash); ++ list_del(&mdp->di_mdsclid); ++ kfree(mdp); ++} ++ ++static void ++release_ds_clientid(struct kref *kref) ++{ ++ struct pnfs_ds_clientid *dcp = ++ container_of(kref, struct pnfs_ds_clientid, dc_ref); ++ struct pnfs_mds_id *mdp; ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = find_pnfs_mds_id(dcp->dc_mdsid); ++ if (mdp) ++ put_ds_mdsid(mdp); ++ ++ list_del(&dcp->dc_hash); ++ list_del(&dcp->dc_stateid); ++ list_del(&dcp->dc_permdsid); ++ kfree(dcp); ++} ++ ++static void ++release_ds_stateid(struct kref *kref) ++{ ++ struct pnfs_ds_stateid *dsp = ++ container_of(kref, struct pnfs_ds_stateid, ds_ref); ++ struct pnfs_ds_clientid *dcp; ++ dprintk("pNFS %s: dsp %p\n", __func__, dsp); ++ ++ dcp 
= find_pnfs_ds_clientid(&dsp->ds_mdsclid); ++ if (dcp) ++ put_ds_clientid(dcp); ++ ++ list_del(&dsp->ds_hash); ++ list_del(&dsp->ds_perclid); ++ kfree(dsp); ++} ++ ++static inline void ++put_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_put(&dcp->dc_ref, release_ds_clientid); ++} ++ ++static inline void ++get_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_get(&dcp->dc_ref); ++} ++ ++static inline void ++put_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_put(&mdp->di_ref, release_ds_mdsid); ++} ++ ++static inline void ++get_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_get(&mdp->di_ref); ++} ++ ++static inline void ++put_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_put(&dsp->ds_ref, release_ds_stateid); ++} ++ ++static inline void ++get_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_get(&dsp->ds_ref); ++} ++ ++void ++nfs4_pnfs_state_shutdown(void) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int i; ++ ++ dprintk("pNFSD %s: -->\n", __func__); ++ ++ ds_lock_state(); ++ for (i = 0; i < STATEID_HASH_SIZE; i++) { ++ while (!list_empty(&ds_stid_hashtbl[i])) { ++ dsp = list_entry(ds_stid_hashtbl[i].next, ++ struct pnfs_ds_stateid, ds_hash); ++ put_ds_stateid(dsp); ++ } ++ } ++ ds_unlock_state(); ++} ++ ++static struct pnfs_mds_id * ++alloc_init_mds_id(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = kmalloc(sizeof(*mdp), GFP_KERNEL); ++ 
if (!mdp)
++		return NULL;
++	INIT_LIST_HEAD(&mdp->di_hash);
++	INIT_LIST_HEAD(&mdp->di_mdsclid);
++	list_add(&mdp->di_hash, &mds_id_tbl);
++	mdp->di_mdsid = gsp->dsid;
++	mdp->di_mdsboot = 0;
++	kref_init(&mdp->di_ref);
++	return mdp;
++}
++
++static struct pnfs_ds_clientid *
++alloc_init_ds_clientid(struct pnfs_get_state *gsp)
++{
++	struct pnfs_mds_id *mdp;
++	struct pnfs_ds_clientid *dcp;
++	clientid_t *clid = (clientid_t *)&gsp->clid;
++	unsigned int hashval = clientid_hashval(clid->cl_id);
++
++	dprintk("pNFSD: %s\n", __func__);
++	/* A ds clientid pins its mds_id: take a ref, or create it with one */
++	mdp = find_pnfs_mds_id(gsp->dsid);
++	if (!mdp) {
++		mdp = alloc_init_mds_id(gsp);
++		if (!mdp)
++			return NULL;
++	} else {
++		get_ds_mdsid(mdp);
++	}
++	dcp = kmalloc(sizeof(*dcp), GFP_KERNEL);
++	if (!dcp) {
++		put_ds_mdsid(mdp);	/* don't leak the reference from above */
++		return NULL;
++	}
++	INIT_LIST_HEAD(&dcp->dc_hash);
++	INIT_LIST_HEAD(&dcp->dc_stateid);
++	INIT_LIST_HEAD(&dcp->dc_permdsid);
++	list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]);
++	list_add(&dcp->dc_permdsid, &mdp->di_mdsclid);
++	dcp->dc_mdsclid = *clid;
++	kref_init(&dcp->dc_ref);
++	dcp->dc_mdsid = gsp->dsid;
++	return dcp;
++}
++
++static struct pnfs_ds_stateid *
++alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp)
++{
++	struct pnfs_ds_stateid *dsp;
++	u32 st_id = stidp->si_stateownerid;
++	u32 f_id = stidp->si_fileid;
++	unsigned int hashval;
++
++	dprintk("pNFSD: %s\n", __func__);
++
++	dsp = kzalloc(sizeof(*dsp), GFP_KERNEL);
++	if (!dsp)
++		return dsp;
++	/* zeroed: release reads ds_mdsclid even if it was never validated */
++	INIT_LIST_HEAD(&dsp->ds_hash);
++	INIT_LIST_HEAD(&dsp->ds_perclid);
++	memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t));
++	fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle);
++	dsp->ds_access = 0;
++	dsp->ds_status = 0;
++	dsp->ds_flags = 0L;
++	kref_init(&dsp->ds_ref);
++	set_bit(DS_STATEID_NEW, &dsp->ds_flags);
++	clear_bit(DS_STATEID_VALID, &dsp->ds_flags);
++	clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++	init_waitqueue_head(&dsp->ds_waitq);
++
++	hashval = stateid_hashval(st_id, f_id);
++	list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]);
++
dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return dsp; ++} ++ ++static int ++update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh, ++ struct pnfs_get_state *gsp) ++{ ++ struct pnfs_ds_clientid *dcp; ++ int new = 0; ++ ++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); ++ ++ dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid); ++ if (!dcp) { ++ dcp = alloc_init_ds_clientid(gsp); ++ if (!dcp) ++ return 1; ++ new = 1; ++ } ++ if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) { ++ list_add(&dsp->ds_perclid, &dcp->dc_stateid); ++ if (!new) ++ get_ds_clientid(dcp); ++ } ++ ++ memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t)); ++ dsp->ds_access = gsp->access; ++ dsp->ds_status = 0; ++ dsp->ds_verifier[0] = gsp->verifier[0]; ++ dsp->ds_verifier[1] = gsp->verifier[1]; ++ memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t)); ++ set_bit(DS_STATEID_VALID, &dsp->ds_flags); ++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ clear_bit(DS_STATEID_NEW, &dsp->ds_flags); ++ return 0; ++} ++ ++int ++nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs) ++{ ++ stateid_t *stid = (stateid_t *)&gs->stid; ++ struct pnfs_ds_stateid *dsp; ++ ++ dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stid)); ++ ++ ds_lock_state(); ++ dsp = find_pnfs_ds_stateid(stid); ++ if (dsp) ++ put_ds_stateid(dsp); ++ ds_unlock_state(); ++ ++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); ++ ++ if (dsp) ++ return 0; ++ return -ENOENT; ++} ++ ++/* Retrieves and validates stateid. ++ * If stateid exists and its fields match, return it. ++ * If stateid exists but either the generation or ++ * ownerids don't match, check with mds to see if it is valid. ++ * If the stateid doesn't exist, the first thread creates a ++ * invalid *marker* stateid, then checks to see if the ++ * stateid exists on the mds. If so, it validates the *marker* ++ * stateid and updates its fields. Subsequent threads that ++ * find the *marker* stateid wait until it is valid or an error ++ * occurs. 
++ * Called with ds_mutex held (ds_lock_state()); drops and retakes it.
++ */
++static struct pnfs_ds_stateid *
++nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp)
++{
++	struct inode *ino = cfh->fh_dentry->d_inode;
++	struct super_block *sb;
++	struct pnfs_ds_stateid *dsp = NULL;
++	struct pnfs_get_state gs = {
++		.access = 0,
++	};
++	int status = 0, waiter = 0, failed;
++
++	dprintk("pNFSD: %s -->\n", __func__);
++
++	dsp = find_pnfs_ds_stateid(stidp);
++	if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) &&
++	    (stidp->si_generation == dsp->ds_stid.si_generation))
++		goto out_noput;
++
++	sb = ino->i_sb;
++	if (!sb || !sb->s_pnfs_op || !sb->s_pnfs_op->get_state)
++		goto out_noput;
++
++	/* Uninitialize current state if it exists yet it doesn't match.
++	 * If it is already invalid, another thread is checking state */
++	if (dsp) {
++		if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags))
++			waiter = 1;
++	} else {
++		dsp = alloc_init_ds_stateid(cfh, stidp);
++		if (!dsp)
++			goto out_noput;
++	}
++
++	dprintk("pNFSD: %s Starting loop\n", __func__);
++	get_ds_stateid(dsp);
++	while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
++		ds_unlock_state();
++		/* Another thread is checking the state */
++		if (waiter) {
++			dprintk("pNFSD: %s waiting\n", __func__);
++			wait_event_interruptible_timeout(dsp->ds_waitq,
++				(test_bit(DS_STATEID_VALID, &dsp->ds_flags) ||
++				 test_bit(DS_STATEID_ERROR, &dsp->ds_flags)),
++				 msecs_to_jiffies(1024));
++			dprintk("pNFSD: %s awake\n", __func__);
++			ds_lock_state();
++			if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
++				goto out;
++			continue;
++		}
++
++		/* Validate stateid on mds */
++		dprintk("pNFSD: %s Checking state on MDS\n", __func__);
++		memcpy(&gs.stid, stidp, sizeof(stateid_t));
++		status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs);
++		dprintk("pNFSD: %s from MDS status %d\n", __func__, status);
++		ds_lock_state();
++		/* if !status and stateid is valid, update id and mark valid */
++		if (status || update_ds_stateid(dsp, cfh, &gs)) {
++			set_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++			/* remove invalid stateid from list */
++			put_ds_stateid(dsp);
++			wake_up(&dsp->ds_waitq);
++			goto out;
++		}
++		wake_up(&dsp->ds_waitq);
++	}
++out:
++	/* Sample ERROR first: this put may drop the last ref and free dsp */
++	failed = test_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++	put_ds_stateid(dsp);
++	if (failed)
++		dsp = NULL;
++out_noput:
++	if (dsp)
++		dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n",
++			__func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid));
++	/* If error, return null */
++	if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
++		dsp = NULL;
++	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++	return dsp;
++}
++
++int
++nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid)
++{
++	struct pnfs_ds_stateid *dsp;
++	int status = 0;
++
++	dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__,
++		STATEID_VAL(stateid));
++
++	/* Must release state lock while verifying stateid on mds */
++	nfs4_unlock_state();
++	ds_lock_state();
++	dsp = nfsv4_ds_get_state(cfh, stateid);
++	if (dsp) {
++		get_ds_stateid(dsp);
++		dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__,
++			STATEID_VAL(&dsp->ds_stid));
++
++		dprintk("NFSD: %s: dsp %p fh_size %u:%u "
++			"fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] "
++			"gen %x:%x\n",
++			__func__, dsp,
++			cfh->fh_handle.fh_size, dsp->ds_fh.fh_size,
++			((unsigned *)&cfh->fh_handle.fh_base)[0],
++			((unsigned *)&cfh->fh_handle.fh_base)[1],
++			((unsigned *)&cfh->fh_handle.fh_base)[2],
++			((unsigned *)&cfh->fh_handle.fh_base)[3],
++			((unsigned *)&dsp->ds_fh.fh_base)[0],
++			((unsigned *)&dsp->ds_fh.fh_base)[1],
++			((unsigned *)&dsp->ds_fh.fh_base)[2],
++			((unsigned *)&dsp->ds_fh.fh_base)[3],
++			stateid->si_generation, dsp->ds_stid.si_generation);
++	}
++
++	if (!dsp ||
++	    (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) ||
++	    (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base,
++		    dsp->ds_fh.fh_size) != 0) ||
++	    (stateid->si_generation > dsp->ds_stid.si_generation))
++		status = nfserr_bad_stateid;
++	else if (stateid->si_generation < dsp->ds_stid.si_generation)
++		status = nfserr_old_stateid;
++
++	if (dsp)
++
put_ds_stateid(dsp);
++	ds_unlock_state();
++	nfs4_lock_state();
++	dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status));
++	return status;
++}
++
++void
++nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p)
++{
++	struct pnfs_ds_stateid *dsp = NULL;
++
++	dprintk("pNFSD: %s --> stid %p\n", __func__, stateid);
++
++	ds_lock_state();
++	if (stateid != NULL) {
++		dsp = find_pnfs_ds_stateid(stateid);
++		if (dsp)
++			get_ds_stateid(dsp);
++	}
++
++	/* XXX: Should we fetch the stateid or wait if some other
++	 * thread is currently retrieving the stateid ? */
++	if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
++		*p++ = dsp->ds_verifier[0];
++		*p++ = dsp->ds_verifier[1];
++	} else {
++		/* must be on MDS */
++		ds_unlock_state();
++		sb->s_pnfs_op->get_verifier(sb, p);
++		ds_lock_state();
++	}
++	if (dsp)
++		put_ds_stateid(dsp);	/* also when found but not valid */
++	ds_unlock_state();
++	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++	return;
++}
++
++#endif /* CONFIG_PNFSD */
+diff -up linux-2.6.38.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4proc.c
+--- linux-2.6.38.noarch/fs/nfsd/nfs4proc.c.orig	2011-03-14 21:20:32.000000000 -0400
++++ linux-2.6.38.noarch/fs/nfsd/nfs4proc.c	2011-03-26 07:57:44.289821184 -0400
+@@ -34,10 +34,14 @@
+  */
+ #include
+ #include
++#include
++#include
++#include
+ 
+ #include "cache.h"
+ #include "xdr4.h"
+ #include "vfs.h"
++#include "pnfsd.h"
+ 
+ #define NFSDDBG_FACILITY NFSDDBG_PROC
+ 
+@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc
+ 	 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
+ */ + status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); ++#if defined(CONFIG_SPNFS) ++ if (!status && spnfs_enabled()) { ++ struct inode *inode = cstate->current_fh.fh_dentry->d_inode; ++ ++ status = spnfs_open(inode, open); ++ if (status) { ++ dprintk( ++ "nfsd: pNFS could not be enabled for inode: %lu\n", ++ inode->i_ino); ++ /* ++ * XXX When there's a failure then need to indicate to ++ * future ops that no pNFS is available. Should I save ++ * the status in the inode? It's kind of a big hammer. ++ * But there may be no stripes available? ++ */ ++ } ++ } ++#endif /* CONFIG_SPNFS */ + out: + if (open->op_stateowner) { + nfs4_get_stateowner(open->op_stateowner); +@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str + &access->ac_supported); + } + ++static void ++nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf) ++{ ++ u32 *p = (u32 *)verf->data; ++ ++#if defined(CONFIG_PNFSD) ++ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) { ++ nfs4_ds_get_verifier(NULL, sb, p); ++ return; ++ } ++#endif /* CONFIG_PNFSD */ ++ ++ *p++ = nfssvc_boot.tv_sec; ++ *p++ = nfssvc_boot.tv_usec; ++} ++ + static __be32 + nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) + { + __be32 status; + +- u32 *p = (u32 *)commit->co_verf.data; +- *p++ = nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; +- ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &commit->co_verf); + status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); + if (status == nfserr_symlink) +@@ -846,7 +882,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru + { + stateid_t *stateid = &write->wr_stateid; + struct file *filp = NULL; +- u32 *p; + __be32 status = nfs_ok; + unsigned long cnt; + +@@ -868,13 +903,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru + + cnt = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; +- p = (u32 *)write->wr_verifier.data; +- *p++ = 
nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; + ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &write->wr_verifier); ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) { ++ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode, ++ RETURN_FILE, write->wr_offset, write->wr_buflen); ++ if (!status) { ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++ } ++ } else ++#endif ++ ++ if (spnfs_enabled()) { ++ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode, ++ write->wr_offset, write->wr_buflen, write->wr_vlen, ++ rqstp); ++ if (status == nfs_ok) { ++ /* DMXXX: HACK to get filesize set */ ++ /* write one byte at offset+length-1 */ ++ struct kvec k[1]; ++ char zero = 0; ++ unsigned long cnt = 1; ++ ++ k[0].iov_base = (void *)&zero; ++ k[0].iov_len = 1; ++ nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset+write->wr_buflen-1, k, 1, ++ &cnt, &write->wr_how_written); ++ } ++ } else /* we're not an MDS */ ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++#else + status = nfsd_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + &cnt, &write->wr_how_written); ++#endif /* CONFIG_SPNFS */ ++ + if (filp) + fput(filp); + +@@ -965,6 +1036,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str + return status == nfserr_same ? nfs_ok : status; + } + ++#if defined(CONFIG_PNFSD) ++ ++static __be32 ++nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp, ++ unsigned int layout_type) ++{ ++ int status, type; ++ ++ /* check to see if pNFS is supported. 
*/ ++ status = nfserr_layoutunavailable; ++ if (exp && exp->ex_pnfs == 0) { ++ dprintk("%s: Underlying file system " ++ "is not exported over pNFS\n", __func__); ++ goto out; ++ } ++ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) { ++ dprintk("%s: Underlying file system " ++ "does not support pNFS\n", __func__); ++ goto out; ++ } ++ ++ type = sb->s_pnfs_op->layout_type(sb); ++ ++ /* check to see if requested layout type is supported. */ ++ status = nfserr_unknown_layouttype; ++ if (!type) ++ dprintk("BUG: %s: layout_type 0 is reserved and must not be " ++ "used by filesystem\n", __func__); ++ else if (type != layout_type) ++ dprintk("%s: requested layout type %d " ++ "does not match supported type %d\n", ++ __func__, layout_type, type); ++ else ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevlist(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevlist *gdlp) ++{ ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ int status; ++ ++ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n", ++ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices, ++ gdlp->gd_cookie, gdlp->gd_verf); ++ ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* We must be able to encode at list one device */ ++ if (!gdlp->gd_maxdevices) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ gdlp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Do nothing if underlying file system does not support ++ * getdevicelist */ ++ if (!sb->s_pnfs_op->get_device_iter) { ++ status = nfserr_notsupp; ++ goto out; ++ } ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdlp->gd_fhp = &cstate->current_fh; 
++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevinfo(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevinfo *gdp) ++{ ++ struct super_block *sb; ++ int status; ++ clientid_t clid; ++ ++ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n", ++ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid, ++ gdp->gd_devid.devid, gdp->gd_maxcount); ++ ++ status = nfserr_inval; ++ sb = find_sbid_id(gdp->gd_devid.sbid); ++ dprintk("%s: sb %p\n", __func__, sb); ++ if (!sb) { ++ status = nfserr_noent; ++ goto out; ++ } ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdp->gd_sb = sb; ++ ++ /* Update notifications */ ++ copy_clientid(&clid, cstate->session); ++ pnfs_set_device_notify(&clid, gdp->gd_notify_types); ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutget(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lgp->lg_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_badiomode; ++ if (lgp->lg_seg.iomode != IOMODE_READ && ++ lgp->lg_seg.iomode != IOMODE_RW) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lgp->lg_seg.iomode); ++ goto out; ++ } ++ ++ /* Set up arguments so layout can be retrieved at encode time */ ++ lgp->lg_fhp = current_fh; ++ copy_clientid((clientid_t 
*)&lgp->lg_seg.clientid, cstate->session); ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutcommit(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ int status; ++ struct inode *ino = NULL; ++ struct iattr ia; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ dprintk("NFSD: nfsd4_layoutcommit \n"); ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ ino = current_fh->fh_dentry->d_inode; ++ if (!ino) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = ino->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lcp->args.lc_seg.layout_type); ++ if (status) ++ goto out; ++ ++ /* This will only extend the file length. Do a quick ++ * check to see if there is any point in waiting for the update ++ * locks. ++ * TODO: Is this correct for all back ends? 
++ */ ++ dprintk("%s:new offset: %d new size: %llu old size: %lld\n", ++ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1, ++ ino->i_size); ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session); ++ lcp->res.lc_size_chg = 0; ++ if (sb->s_pnfs_op->layout_commit) { ++ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res); ++ dprintk("%s:layout_commit result %d\n", __func__, status); ++ } else { ++ fh_lock(current_fh); ++ if ((lcp->args.lc_newoffset == 0) || ++ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) { ++ status = 0; ++ lcp->res.lc_size_chg = 0; ++ fh_unlock(current_fh); ++ goto out; ++ } ++ ++ /* Try our best to update the file size */ ++ dprintk("%s: Modifying file size\n", __func__); ++ ia.ia_valid = ATTR_SIZE; ++ ia.ia_size = lcp->args.lc_last_wr + 1; ++ status = notify_change(current_fh->fh_dentry, &ia); ++ fh_unlock(current_fh); ++ dprintk("%s:notify_change result %d\n", __func__, status); ++ } ++ ++ if (!status && lcp->res.lc_size_chg && ++ EX_ISSYNC(current_fh->fh_export)) { ++ dprintk("%s: Synchronously writing inode size %llu\n", ++ __func__, ino->i_size); ++ write_inode_now(ino, 1); ++ lcp->res.lc_newsize = i_size_read(ino); ++ } ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutreturn(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lrp->args.lr_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_return_type != 
RETURN_FILE && ++ lrp->args.lr_return_type != RETURN_FSID && ++ lrp->args.lr_return_type != RETURN_ALL) { ++ dprintk("pNFS %s: invalid return_type %d\n", __func__, ++ lrp->args.lr_return_type); ++ goto out; ++ } ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_seg.iomode != IOMODE_READ && ++ lrp->args.lr_seg.iomode != IOMODE_RW && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lrp->args.lr_seg.iomode); ++ goto out; ++ } ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session); ++ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE); ++ status = nfs4_pnfs_return_layout(sb, current_fh, lrp); ++out: ++ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n", ++ __func__, status, lrp->args.lr_return_type, lrp->lrs_present); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * NULL call. + */ +@@ -1355,6 +1726,29 @@ static struct nfsd4_operation nfsd4_ops[ + .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, + .op_name = "OP_SECINFO_NO_NAME", + }, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICELIST] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevlist, ++ .op_name = "OP_GETDEVICELIST", ++ }, ++ [OP_GETDEVICEINFO] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevinfo, ++ .op_flags = ALLOWED_WITHOUT_FH, ++ .op_name = "OP_GETDEVICEINFO", ++ }, ++ [OP_LAYOUTGET] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutget, ++ .op_name = "OP_LAYOUTGET", ++ }, ++ [OP_LAYOUTCOMMIT] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutcommit, ++ .op_name = "OP_LAYOUTCOMMIT", ++ }, ++ [OP_LAYOUTRETURN] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutreturn, ++ .op_name = "OP_LAYOUTRETURN", ++ }, ++#endif /* CONFIG_PNFSD */ + }; + + static const char *nfsd4_op_name(unsigned opnum) +diff -up linux-2.6.38.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4state.c +--- linux-2.6.38.noarch/fs/nfsd/nfs4state.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ 
linux-2.6.38.noarch/fs/nfsd/nfs4state.c 2011-03-26 07:57:44.291821168 -0400 +@@ -42,6 +42,8 @@ + #include "xdr4.h" + #include "vfs.h" + ++#include "pnfsd.h" ++ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ +@@ -59,8 +61,6 @@ static u64 current_sessionid = 1; + #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + + /* forward declarations */ +-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); + static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; + static void nfs4_set_recdir(char *recdir); + +@@ -68,6 +68,7 @@ static void nfs4_set_recdir(char *recdir + + /* Currently used for almost all code touching nfsv4 state: */ + static DEFINE_MUTEX(client_mutex); ++struct task_struct *client_mutex_owner; + + /* + * Currently used for the del_recall_lru and file hash table. In an +@@ -85,11 +86,21 @@ void + nfs4_lock_state(void) + { + mutex_lock(&client_mutex); ++ client_mutex_owner = current; ++} ++ ++#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current) ++ ++void ++nfs4_bug_on_unlocked_state(void) ++{ ++ BUG_ON(client_mutex_owner != current); + } + + void + nfs4_unlock_state(void) + { ++ client_mutex_owner = NULL; + mutex_unlock(&client_mutex); + } + +@@ -108,7 +119,7 @@ opaque_hashval(const void *ptr, int nbyt + + static struct list_head del_recall_lru; + +-static inline void ++inline void + put_nfs4_file(struct nfs4_file *fi) + { + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { +@@ -119,7 +130,7 @@ put_nfs4_file(struct nfs4_file *fi) + } + } + +-static inline void ++inline void + get_nfs4_file(struct nfs4_file *fi) + { + atomic_inc(&fi->fi_ref); +@@ -179,10 +190,16 @@ static void nfs4_file_get_access(struct + + static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) + { +- if (fp->fi_fds[oflag]) { +- fput(fp->fi_fds[oflag]); +- fp->fi_fds[oflag] = NULL; +- } ++ struct 
file *fd = fp->fi_fds[oflag]; ++ ++ if (!fd) ++ return; ++ ++ fp->fi_fds[oflag] = NULL; ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ ++ fput(fd); ++ nfs4_lock_state(); + } + + static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) +@@ -295,8 +312,8 @@ static DEFINE_SPINLOCK(client_lock); + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * +- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed +- * setclientid_confirmed info. ++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold ++ * confirmed setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed + * setclientid info. +@@ -321,6 +338,7 @@ static void unhash_generic_stateid(struc + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); ++ release_pnfs_ds_dev_list(stp); + } + + static void free_generic_stateid(struct nfs4_stateid *stp) +@@ -943,6 +961,8 @@ expire_client(struct nfs4_client *clp) + struct nfs4_delegation *dp; + struct list_head reaplist; + ++ BUG_ON_UNLOCKED_STATE(); ++ + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { +@@ -960,6 +980,7 @@ expire_client(struct nfs4_client *clp) + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } ++ pnfs_expire_client(clp); + nfsd4_shutdown_callback(clp); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); +@@ -972,6 +993,13 @@ expire_client(struct nfs4_client *clp) + spin_unlock(&client_lock); + } + ++void expire_client_lock(struct nfs4_client *clp) ++{ ++ nfs4_lock_state(); ++ expire_client(clp); ++ nfs4_unlock_state(); ++} ++ + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) + { + memcpy(target->cl_verifier.data, source->data, +@@ -1063,6 +1091,11 @@ static struct nfs4_client 
*create_client + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++ INIT_LIST_HEAD(&clp->cl_layoutrecalls); ++ atomic_set(&clp->cl_deviceref, 0); ++#endif /* CONFIG_PNFSD */ + INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); + spin_lock_init(&clp->cl_lock); +@@ -1114,7 +1147,7 @@ move_to_confirmed(struct nfs4_client *cl + renew_client(clp); + } + +-static struct nfs4_client * ++struct nfs4_client * + find_confirmed_client(clientid_t *clid) + { + struct nfs4_client *clp; +@@ -1169,6 +1202,24 @@ find_unconfirmed_client_by_str(const cha + return NULL; + } + ++int ++filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), ++ void *arg) ++{ ++ struct nfs4_client *clp, *next; ++ int i, status = 0; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i], ++ cl_strhash) { ++ status = func(clp, arg); ++ if (status) ++ return status; /* break would only leave this bucket */ ++ } ++ ++ return status; ++} ++ + static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) + { + switch (family) { +@@ -1317,8 +1368,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co + static void + nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) + { +- /* pNFS is not supported */ ++#if defined(CONFIG_PNFSD) ++ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS | ++ EXCHGID4_FLAG_USE_PNFS_DS; ++#else /* CONFIG_PNFSD */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; ++#endif /* CONFIG_PNFSD */ + + /* Referrals are supported, Migration is not.
*/ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; +@@ -1514,6 +1569,13 @@ nfsd4_create_session(struct svc_rqst *rq + bool confirm_me = false; + int status = 0; + ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ /* XXX hack to get local ip address */ ++ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local, ++ sizeof(pnfsd_lexp_addr)); ++ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); +@@ -1549,6 +1611,9 @@ nfsd4_create_session(struct svc_rqst *rq + goto out; + } + ++ if (is_ds_only_session(unconf->cl_exchange_flags)) ++ cr_ses->flags &= ~SESSION4_BACK_CHAN; ++ + confirm_me = true; + conf = unconf; + } else { +@@ -1791,8 +1856,14 @@ out: + + nfsd4_get_session(cstate->session); + atomic_inc(&clp->cl_refcount); +- if (clp->cl_cb_state == NFSD4_CB_DOWN) ++ switch (clp->cl_cb_state) { ++ case NFSD4_CB_DOWN: + seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; ++ break; ++ case NFSD4_CB_FAULT: ++ seq->status_flags |= SEQ4_STATUS_BACKCHANNEL_FAULT; ++ break; ++ } + } + kfree(conn); + spin_unlock(&client_lock); +@@ -2051,7 +2122,7 @@ out: + + /* OPEN Share state helper functions */ + static inline struct nfs4_file * +-alloc_init_file(struct inode *ino) ++alloc_init_file(struct inode *ino, struct svc_fh *current_fh) + { + struct nfs4_file *fp; + unsigned int hashval = file_hashval(ino); +@@ -2068,6 +2139,16 @@ alloc_init_file(struct inode *ino) + fp->fi_lease = NULL; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&fp->fi_layouts); ++ INIT_LIST_HEAD(&fp->fi_layout_states); ++ fp->fi_fsid.major = current_fh->fh_export->ex_fsid; ++ fp->fi_fsid.minor = 0; ++ fp->fi_fhlen = current_fh->fh_handle.fh_size; ++ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval)); ++ memcpy(fp->fi_fhval, &current_fh->fh_handle.fh_base, ++
fp->fi_fhlen); ++#endif /* CONFIG_PNFSD */ + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); +@@ -2076,7 +2157,7 @@ alloc_init_file(struct inode *ino) + return NULL; + } + +-static void ++void + nfsd4_free_slab(struct kmem_cache **slab) + { + if (*slab == NULL) +@@ -2092,6 +2173,7 @@ nfsd4_free_slabs(void) + nfsd4_free_slab(&file_slab); + nfsd4_free_slab(&stateid_slab); + nfsd4_free_slab(&deleg_slab); ++ nfsd4_free_pnfs_slabs(); + } + + static int +@@ -2113,6 +2195,8 @@ nfsd4_init_slabs(void) + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_nomem; ++ if (nfsd4_init_pnfs_slabs()) ++ goto out_nomem; + return 0; + out_nomem: + nfsd4_free_slabs(); +@@ -2186,6 +2270,9 @@ init_stateid(struct nfs4_stateid *stp, s + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); + INIT_LIST_HEAD(&stp->st_perfile); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); +@@ -2227,6 +2314,7 @@ find_openstateowner_str(unsigned int has + { + struct nfs4_stateowner *so = NULL; + ++ BUG_ON_UNLOCKED_STATE(); + list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) + return so; +@@ -2235,7 +2323,7 @@ find_openstateowner_str(unsigned int has + } + + /* search file_hashtbl[] for file */ +-static struct nfs4_file * ++struct nfs4_file * + find_file(struct inode *ino) + { + unsigned int hashval = file_hashval(ino); +@@ -2253,6 +2341,18 @@ find_file(struct inode *ino) + return NULL; + } + ++struct nfs4_file * ++find_alloc_file(struct inode *ino, struct svc_fh *current_fh) ++{ ++ struct nfs4_file *fp; ++ ++ fp = find_file(ino); ++ if (fp) ++ return fp; ++ ++ return alloc_init_file(ino, current_fh); ++} ++ + static 
inline int access_valid(u32 x, u32 minorversion) + { + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) +@@ -2787,7 +2887,7 @@ nfsd4_process_open2(struct svc_rqst *rqs + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + goto out; + status = nfserr_resource; +- fp = alloc_init_file(ino); ++ fp = alloc_init_file(ino, current_fh); + if (fp == NULL) + goto out; + } +@@ -3006,7 +3106,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct + return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; + } + +-static int ++int + STALE_STATEID(stateid_t *stateid) + { + if (stateid->si_boot == boot_time) +@@ -3016,6 +3116,16 @@ STALE_STATEID(stateid_t *stateid) + return 1; + } + ++__be32 ++nfs4_check_stateid(stateid_t *stateid) ++{ ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ return nfserr_bad_stateid; ++ if (STALE_STATEID(stateid)) ++ return nfserr_stale_stateid; ++ return 0; ++} ++ + static inline int + access_permit_read(unsigned long access_bmap) + { +@@ -3127,6 +3237,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + if (grace_disallows_io(ino)) + return nfserr_grace; + ++#if defined(CONFIG_PNFSD) ++ if (pnfs_fh_is_ds(&current_fh->fh_handle)) { ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ status = nfserr_bad_stateid; ++ else ++#ifdef CONFIG_GFS2_FS_LOCKING_DLM ++ { ++ dprintk("%s Don't check DS stateid\n", __func__); ++ return 0; ++ } ++#else /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ status = nfs4_preprocess_pnfs_ds_stateid(current_fh, ++ stateid); ++#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ goto out; ++ } ++#endif /* CONFIG_PNFSD */ ++ + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + +@@ -3214,13 +3342,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co + *stpp = NULL; + *sopp = NULL; + +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { +- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); +- return nfserr_bad_stateid; +- } +- +- if (STALE_STATEID(stateid)) +- return nfserr_stale_stateid; ++ status = nfs4_check_stateid(stateid); ++ if
(status) ++ return status; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; +@@ -3495,11 +3619,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + nfs4_lock_state(); +- status = nfserr_bad_stateid; +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) +- goto out; +- status = nfserr_stale_stateid; +- if (STALE_STATEID(stateid)) ++ status = nfs4_check_stateid(stateid); ++ if (status) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) +@@ -3529,26 +3650,6 @@ out: + #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) + #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +-static inline u64 +-end_offset(u64 start, u64 len) +-{ +- u64 end; +- +- end = start + len; +- return end >= start ? end: NFS4_MAX_UINT64; +-} +- +-/* last octet in a range */ +-static inline u64 +-last_byte_offset(u64 start, u64 len) +-{ +- u64 end; +- +- BUG_ON(!len); +- end = start + len; +- return end > start ? end - 1: NFS4_MAX_UINT64; +-} +- + #define lockownerid_hashval(id) \ + ((id) & LOCK_HASH_MASK) + +@@ -3565,7 +3666,7 @@ static struct list_head lock_ownerid_has + static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; + static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; + +-static struct nfs4_stateid * ++struct nfs4_stateid * + find_stateid(stateid_t *stid, int flags) + { + struct nfs4_stateid *local; +@@ -3594,7 +3695,7 @@ find_stateid(stateid_t *stid, int flags) + return NULL; + } + +-static struct nfs4_delegation * ++struct nfs4_delegation * + find_delegation_stateid(struct inode *ino, stateid_t *stid) + { + struct nfs4_file *fp; +@@ -3725,6 +3826,9 @@ alloc_init_lock_stateid(struct nfs4_stat + INIT_LIST_HEAD(&stp->st_perfile); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + 
list_add(&stp->st_perfile, &fp->fi_stateids); + list_add(&stp->st_perstateowner, &sop->so_stateids); +@@ -4301,6 +4405,9 @@ nfs4_state_init(void) + INIT_LIST_HEAD(&client_lru); + INIT_LIST_HEAD(&del_recall_lru); + reclaim_str_hashtbl_size = 0; ++#if defined(CONFIG_PNFSD) ++ nfs4_pnfs_state_init(); ++#endif /* CONFIG_PNFSD */ + return 0; + } + +@@ -4405,6 +4512,7 @@ __nfs4_state_shutdown(void) + } + + nfsd4_shutdown_recdir(); ++ nfs4_pnfs_state_shutdown(); + } + + void +diff -up linux-2.6.38.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.38.noarch/fs/nfsd/nfs4xdr.c +--- linux-2.6.38.noarch/fs/nfsd/nfs4xdr.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/nfs4xdr.c 2011-03-26 07:57:44.294821141 -0400 +@@ -45,11 +45,16 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + #include "idmap.h" + #include "acl.h" + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + + #define NFSDDBG_FACILITY NFSDDBG_XDR +@@ -1279,6 +1284,138 @@ static __be32 nfsd4_decode_reclaim_compl + DECODE_TAIL; + } + ++#if defined(CONFIG_PNFSD) ++static __be32 ++nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16 + sizeof(nfs4_verifier)); ++ READ32(gdevl->gd_layout_type); ++ READ32(gdevl->gd_maxdevices); ++ READ64(gdevl->gd_cookie); ++ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ u32 num; ++ DECODE_HEAD; ++ ++ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid)); ++ READ64(gdev->gd_devid.sbid); ++ READ64(gdev->gd_devid.devid); ++ READ32(gdev->gd_layout_type); ++ READ32(gdev->gd_maxcount); ++ READ32(num); ++ if (num) { ++ READ_BUF(4); ++ READ32(gdev->gd_notify_types); ++ } else { ++ gdev->gd_notify_types = 0; ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, ++ 
struct nfsd4_pnfs_layoutget *lgp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(36); ++ READ32(lgp->lg_signal); ++ READ32(lgp->lg_seg.layout_type); ++ READ32(lgp->lg_seg.iomode); ++ READ64(lgp->lg_seg.offset); ++ READ64(lgp->lg_seg.length); ++ READ64(lgp->lg_minlength); ++ if ((status = nfsd4_decode_stateid(argp, &lgp->lg_sid))) return status; ++ READ_BUF(4); ++ READ32(lgp->lg_maxcount); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ DECODE_HEAD; ++ u32 timechange; ++ ++ READ_BUF(20); ++ READ64(lcp->args.lc_seg.offset); ++ READ64(lcp->args.lc_seg.length); ++ READ32(lcp->args.lc_reclaim); ++ if ((status = nfsd4_decode_stateid(argp, &lcp->lc_sid))) return status; ++ READ_BUF(4); ++ READ32(lcp->args.lc_newoffset); ++ if (lcp->args.lc_newoffset) { ++ READ_BUF(8); ++ READ64(lcp->args.lc_last_wr); ++ } else ++ lcp->args.lc_last_wr = 0; ++ READ_BUF(4); ++ READ32(timechange); ++ if (timechange) { ++ READ_BUF(12); ++ READ64(lcp->args.lc_mtime.seconds); ++ READ32(lcp->args.lc_mtime.nseconds); ++ } else { ++ lcp->args.lc_mtime.seconds = 0; ++ lcp->args.lc_mtime.nseconds = 0; ++ } ++ READ_BUF(8); ++ READ32(lcp->args.lc_seg.layout_type); ++ /* XXX: saving XDR'ed layout update. Since we don't have the ++ * current_fh yet, and therefore no export_ops, we can't call ++ * the layout specific decode routines. File and pVFS2 ++ * do not use the layout update....
++ */ ++ READ32(lcp->args.lc_up_len); ++ if (lcp->args.lc_up_len > 0) { ++ READ_BUF(lcp->args.lc_up_len); ++ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len); ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16); ++ READ32(lrp->args.lr_reclaim); ++ READ32(lrp->args.lr_seg.layout_type); ++ READ32(lrp->args.lr_seg.iomode); ++ READ32(lrp->args.lr_return_type); ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ READ_BUF(16); ++ READ64(lrp->args.lr_seg.offset); ++ READ64(lrp->args.lr_seg.length); ++ if ((status = nfsd4_decode_stateid(argp, &lrp->lr_sid))) return status; ++ READ_BUF(4); ++ READ32(lrp->args.lrf_body_len); ++ if (lrp->args.lrf_body_len > 0) { ++ READ_BUF(lrp->args.lrf_body_len); ++ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len); ++ } ++ } ++ ++ DECODE_TAIL; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) + { +@@ -1380,11 +1517,19 @@ static nfsd4_dec nfsd41_dec_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, ++#else /* CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, + [OP_SEQUENCE] =
(nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, +@@ -2191,6 +2336,36 @@ out_acl: + } + WRITE64(stat.ino); + } ++#if defined(CONFIG_PNFSD) ++ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { ++ struct super_block *sb = dentry->d_inode->i_sb; ++ int type = 0; ++ ++ /* Query the filesystem for supported pNFS layout types. ++ * Currently, we only support one layout type per file system. ++ * The export_ops->layout_type() returns the pnfs_layouttype4. ++ */ ++ buflen -= 4; ++ if (buflen < 0) /* length */ ++ goto out_resource; ++ ++ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type) ++ type = sb->s_pnfs_op->layout_type(sb); ++ if (type) { ++ if ((buflen -= 4) < 0) /* type */ ++ goto out_resource; ++ WRITE32(1); /* length */ ++ WRITE32(type); /* type */ ++ } else ++ WRITE32(0); /* length */ ++ } ++ ++ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ if ((buflen -= 4) < 0) ++ goto out_resource; ++ WRITE32(stat.blksize); ++ } ++#endif /* CONFIG_PNFSD */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); +@@ -2434,6 +2609,10 @@ nfsd4_encode_commit(struct nfsd4_compoun + if (!nfserr) { + RESERVE_SPACE(8); + WRITEMEM(commit->co_verf.data, 8); ++ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n", ++ ((u32 *)(&commit->co_verf.data))[0], ++ ((u32 *)(&commit->co_verf.data))[1]); ++ + ADJUST_ARGS(); + } + return nfserr; +@@ -2688,6 +2867,13 @@ nfsd4_encode_read(struct nfsd4_compoundr + } + read->rd_vlen = v; + ++#if defined(CONFIG_SPNFS) ++ if (spnfs_enabled()) ++ nfserr = spnfs_read(read->rd_fhp->fh_dentry->d_inode, ++ read->rd_offset, &maxcount, read->rd_vlen, ++ resp->rqstp); ++ else /* we're not an MDS */ ++#endif /* CONFIG_SPNFS */ + nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, + &maxcount); +@@ -3007,6 +3193,9 @@ nfsd4_encode_write(struct nfsd4_compound + WRITE32(write->wr_bytes_written); + 
WRITE32(write->wr_how_written); + WRITEMEM(write->wr_verifier.data, 8); ++ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n", ++ ((u32 *)(&write->wr_verifier.data))[0], ++ ((u32 *)(&write->wr_verifier.data))[1]); + ADJUST_ARGS(); + } + return nfserr; +@@ -3146,6 +3335,343 @@ nfsd4_encode_sequence(struct nfsd4_compo + return 0; + } + ++#if defined(CONFIG_PNFSD) ++ ++/* Uses the export interface to iterate through the available devices ++ * and encodes them on the response stream. ++ */ ++static __be32 ++nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp, ++ struct nfsd4_pnfs_getdevlist *gdevl, ++ unsigned int *dev_count) ++{ ++ struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb; ++ __be32 nfserr; ++ int status; ++ __be32 *p; ++ struct nfsd4_pnfs_dev_iter_res res = { ++ .gd_cookie = gdevl->gd_cookie, ++ .gd_verf = gdevl->gd_verf, ++ .gd_eof = 0 ++ }; ++ u64 sbid; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ sbid = find_create_sbid(sb); ++ *dev_count = 0; ++ do { ++ status = sb->s_pnfs_op->get_device_iter(sb, ++ gdevl->gd_layout_type, ++ &res); ++ if (status) { ++ if (status == -ENOENT) { ++ res.gd_eof = 1; ++ /* return success */ ++ break; ++ } ++ nfserr = nfserrno(status); ++ goto out_err; ++ } ++ ++ /* Encode device id and layout type */ ++ RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid)); ++ WRITE64((__be64)sbid); ++ WRITE64(res.gd_devid); /* devid minor */ ++ ADJUST_ARGS(); ++ (*dev_count)++; ++ } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof); ++ gdevl->gd_cookie = res.gd_cookie; ++ gdevl->gd_verf = res.gd_verf; ++ gdevl->gd_eof = res.gd_eof; ++ nfserr = nfs_ok; ++out_err: ++ dprintk("%s: Encoded %u devices\n", __func__, *dev_count); ++ return nfserr; ++} ++ ++/* Encodes the response of get device list. 
++*/ ++static __be32 ++nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ unsigned int dev_count = 0, lead_count; ++ u32 *p_in = resp->p; ++ __be32 *p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ /* Ensure we have room for cookie, verifier, and devlist len, ++ * which we will backfill in after we encode as many devices as possible ++ */ ++ lead_count = 8 + sizeof(nfs4_verifier) + 4; ++ RESERVE_SPACE(lead_count); ++ /* skip past these values */ ++ p += XDR_QUADLEN(lead_count); ++ ADJUST_ARGS(); ++ ++ /* Iterate over as many device ids as possible on the xdr stream */ ++ nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count); ++ if (nfserr) ++ goto out_err; ++ ++ /* Backfill in cookie, verf and number of devices encoded */ ++ p = p_in; ++ WRITE64(gdevl->gd_cookie); ++ WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ WRITE32(dev_count); ++ ++ /* Skip over devices */ ++ p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid)); ++ ADJUST_ARGS(); ++ ++ /* are we at the end of devices? */ ++ RESERVE_SPACE(4); ++ WRITE32(gdevl->gd_eof); ++ ADJUST_ARGS(); ++ ++ dprintk("%s: done.\n", __func__); ++ ++ nfserr = nfs_ok; ++out: ++ return nfserr; ++out_err: ++ p = p_in; ++ ADJUST_ARGS(); ++ goto out; ++} ++ ++/* For a given device id, have the file system retrieve and encode the ++ * associated device. For file layout, the encoding function is ++ * passed down to the file system. The file system then has the option ++ * of using this encoding function or one of its own. ++ * ++ * Note: the file system must return the XDR size of struct device_addr4 ++ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the ++ * gdir_mincount calculation. 
++ */ ++static __be32 ++nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ struct super_block *sb; ++ int maxcount = 0, type_notify_len = 12; ++ __be32 *p, *p_save = NULL, *p_in = resp->p; ++ struct exp_xdr_stream xdr; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = gdev->gd_sb; ++ ++ if (gdev->gd_maxcount != 0) { ++ /* FIXME: this will be bound by the session max response */ ++ maxcount = svc_max_payload(resp->rqstp); ++ if (maxcount > gdev->gd_maxcount) ++ maxcount = gdev->gd_maxcount; ++ ++ /* Ensure have room for type and notify field */ ++ maxcount -= type_notify_len; ++ if (maxcount < 0) { ++ nfserr = -ETOOSMALL; ++ goto toosmall; ++ } ++ } ++ ++ RESERVE_SPACE(4); ++ WRITE32(gdev->gd_layout_type); ++ ADJUST_ARGS(); ++ ++ /* If maxcount is 0 then just update notifications */ ++ if (gdev->gd_maxcount == 0) ++ goto handle_notifications; ++ ++ xdr.p = p_save = resp->p; ++ xdr.end = resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type, ++ &gdev->gd_devid); ++ if (nfserr) ++ goto err; ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ BUG_ON(xdr.p > xdr.end); ++ ++ /* Update the xdr stream with the number of bytes encoded ++ * by the file system. 
++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++handle_notifications: ++ /* Encode supported device notifications */ ++ RESERVE_SPACE(4); ++ if (sb->s_pnfs_op->set_device_notify) { ++ struct pnfs_devnotify_arg dn_args; ++ ++ dn_args.dn_layout_type = gdev->gd_layout_type; ++ dn_args.dn_devid = gdev->gd_devid; ++ dn_args.dn_notify_types = gdev->gd_notify_types; ++ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args); ++ if (nfserr) ++ goto err; ++ WRITE32(dn_args.dn_notify_types); ++ } else { ++ WRITE32(0); ++ } ++ ADJUST_ARGS(); ++ ++out: ++ return nfserrno(nfserr); ++toosmall: ++ dprintk("%s: maxcount too small\n", __func__); ++ RESERVE_SPACE(4); ++ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len); ++ ADJUST_ARGS(); ++ goto out; ++err: ++ /* Rewind to the beginning */ ++ p = p_in; ++ ADJUST_ARGS(); ++ if (nfserr == -ETOOSMALL) ++ goto toosmall; ++ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr); ++ goto out; ++} ++ ++static __be32 ++nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, ++ __be32 nfserr, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int maxcount, leadcount; ++ struct super_block *sb; ++ struct exp_xdr_stream xdr; ++ __be32 *p, *p_save, *p_start = resp->p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb; ++ maxcount = PAGE_SIZE; ++ if (maxcount > lgp->lg_maxcount) ++ maxcount = lgp->lg_maxcount; ++ ++ /* Check for space on xdr stream */ ++ leadcount = 36 + sizeof(stateid_opaque_t); ++ RESERVE_SPACE(leadcount); ++ /* encode layout metadata after file system encodes layout */ ++ p += XDR_QUADLEN(leadcount); ++ ADJUST_ARGS(); ++ ++ /* Ensure have room for ret_on_close, off, len, iomode, type */ ++ maxcount -= leadcount; ++ if (maxcount < 0) { ++ printk(KERN_ERR "%s: buffer too small\n", __func__); ++ nfserr = nfserr_toosmall; ++ goto err; ++ } ++ ++ /* Set xdr info so file system can encode layout */ ++ xdr.p = p_save = resp->p; ++ xdr.end = 
resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ /* Retrieve, encode, and merge layout; process stateid */ ++ nfserr = nfs4_pnfs_get_layout(lgp, &xdr); ++ if (nfserr) ++ goto err; ++ ++ /* Ensure file system returned enough bytes for the client ++ * to access. ++ */ ++ if (lgp->lg_seg.length < lgp->lg_minlength) { ++ nfserr = nfserr_badlayout; ++ goto err; ++ } ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ ++ /* Rewind to beginning and encode attrs */ ++ resp->p = p_start; ++ RESERVE_SPACE(4); ++ WRITE32(lgp->lg_roc); /* return on close */ ++ ADJUST_ARGS(); ++ nfsd4_encode_stateid(resp, &lgp->lg_sid); ++ RESERVE_SPACE(28); ++ /* Note: response logr_layout array count, always one for now */ ++ WRITE32(1); ++ WRITE64(lgp->lg_seg.offset); ++ WRITE64(lgp->lg_seg.length); ++ WRITE32(lgp->lg_seg.iomode); ++ WRITE32(lgp->lg_seg.layout_type); ++ ++ /* Update the xdr stream with the number of bytes written ++ * by the file system ++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++ return nfs_ok; ++err: ++ resp->p = p_start; ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lcp->res.lc_size_chg); ++ ADJUST_ARGS(); ++ if (lcp->res.lc_size_chg) { ++ RESERVE_SPACE(8); ++ WRITE64(lcp->res.lc_newsize); ++ ADJUST_ARGS(); ++ } ++out: ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lrp->lrs_present != 0); /* got stateid? 
*/ ++ ADJUST_ARGS(); ++ if (lrp->lrs_present) ++ nfsd4_encode_stateid(resp, &lrp->lr_sid); ++out: ++ return nfserr; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) + { +@@ -3206,11 +3732,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, ++#else /* CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, +diff -up linux-2.6.38.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.38.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.38.noarch/fs/nfsd/nfsctl.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/nfsctl.c 2011-03-26 07:57:44.295821132 -0400 +@@ -12,11 +12,16 @@ + #include + #include + #include ++#include + + #include "idmap.h" + #include "nfsd.h" + #include "cache.h" + ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++#include ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + /* + * We have a single directory with 9 nodes in it. 
+ */ +@@ -51,6 +56,9 @@ enum { + NFSD_Gracetime, + NFSD_RecoveryDir, + #endif ++#ifdef CONFIG_PNFSD ++ NFSD_pnfs_dlm_device, ++#endif + }; + + /* +@@ -78,6 +86,9 @@ static ssize_t write_leasetime(struct fi + static ssize_t write_gracetime(struct file *file, char *buf, size_t size); + static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); + #endif ++#ifdef CONFIG_PNFSD ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size); ++#endif + + static ssize_t (*write_op[])(struct file *, char *, size_t) = { + #ifdef CONFIG_NFSD_DEPRECATED +@@ -102,6 +113,9 @@ static ssize_t (*write_op[])(struct file + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device, ++#endif + }; + + static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +@@ -1366,6 +1380,68 @@ static ssize_t write_recoverydir(struct + + #endif + ++#ifdef CONFIG_PNFSD ++ ++static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf, ++ size_t size) ++{ ++ char *mesg = buf; ++ char *pnfs_dlm_device; ++ int max_size = NFSD_PNFS_DLM_DEVICE_MAX; ++ int len, ret = 0; ++ ++ if (size > 0) { ++ ret = -EINVAL; ++ if (size > max_size || buf[size-1] != '\n') ++ return ret; ++ buf[size-1] = 0; ++ ++ pnfs_dlm_device = mesg; ++ len = qword_get(&mesg, pnfs_dlm_device, size); ++ if (len <= 0) ++ return ret; ++ ++ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len); ++ } else ++ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT); ++ ++ return ret <= 0 ? 
ret : strlen(buf); ++} ++ ++/** ++ * write_pnfs_dlm_device - Set or report the current pNFS data server list ++ * ++ * Input: ++ * buf: ignored ++ * size: zero ++ * ++ * OR ++ * ++ * Input: ++ * buf: C string containing a block device name, ++ * a colon, and then a comma separated ++ * list of pNFS data server IPv4 addresses ++ * size: non-zero length of C string in @buf ++ * Output: ++ * On success: passed-in buffer filled with '\n'-terminated C ++ * string containing a block device name, a colon, and ++ * then a comma separated list of pNFS ++ * data server IPv4 addresses. ++ * return code is the size in bytes of the string ++ * On error: return code is a negative errno value ++ */ ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size) ++{ ++ ssize_t rv; ++ ++ mutex_lock(&nfsd_mutex); ++ rv = __write_pnfs_dlm_device(file, buf, size); ++ mutex_unlock(&nfsd_mutex); ++ return rv; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ + /*----------------------------------------------------------------------------*/ + /* + * populating the filesystem. 
+@@ -1402,6 +1478,10 @@ static int nfsd_fill_super(struct super_
+ 		[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
+ 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
+ #endif
++#ifdef CONFIG_PNFSD
++		[NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops,
++			S_IWUSR|S_IRUSR},
++#endif
+ 		/* last one */ {""}
+ 	};
+ 	return simple_fill_super(sb, 0x6e667364, nfsd_files);
+@@ -1440,6 +1520,9 @@ static int create_proc_exports_entry(voi
+ }
+ #endif
+ 
++#if defined(CONFIG_SPNFS_BLOCK)
++int nfsd_bl_init(void);
++#endif
+ static int __init init_nfsd(void)
+ {
+ 	int retval;
+@@ -1462,6 +1545,15 @@ static int __init init_nfsd(void)
+ 	retval = create_proc_exports_entry();
+ 	if (retval)
+ 		goto out_free_idmap;
++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
++	retval = spnfs_init_proc();
++	if (retval != 0)
++		goto out_free_all;	/* same unwind as a register_filesystem() failure */
++#if defined(CONFIG_SPNFS_BLOCK)
++	nfsd_bl_init();		/* NOTE(review): int result is ignored */
++#endif /* CONFIG_SPNFS_BLOCK */
++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
++
+ 	retval = register_filesystem(&nfsd_fs_type);
+ 	if (retval)
+ 		goto out_free_all;
+@@ -1484,7 +1576,22 @@ out_free_stat:
+ 
+ static void __exit exit_nfsd(void)
+ {
++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
++	/* Undo spnfs_init_proc(), which registers under fs/spnfs (not
++	 * fs/nfs/spnfs): remove the children first, then the directory. */
++#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
++	remove_proc_entry("fs/spnfs/layoutsegsize", NULL);
++	remove_proc_entry("fs/spnfs/layoutseg", NULL);
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++	remove_proc_entry("fs/spnfs/recall", NULL);
++	remove_proc_entry("fs/spnfs/getfh", NULL);
++	remove_proc_entry("fs/spnfs/config", NULL);
++	remove_proc_entry("fs/spnfs/ctl", NULL);
++	remove_proc_entry("fs/spnfs", NULL);
++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
++
+ 	nfsd_export_shutdown();
++	nfsd4_pnfs_dlm_shutdown();
+ 	nfsd_reply_cache_shutdown();
+ 	remove_proc_entry("fs/nfs/exports", NULL);
+ 	remove_proc_entry("fs/nfs", NULL);
+diff -up linux-2.6.38.noarch/fs/nfsd/nfsd.h.orig linux-2.6.38.noarch/fs/nfsd/nfsd.h
+--- linux-2.6.38.noarch/fs/nfsd/nfsd.h.orig	2011-03-14 21:20:32.000000000 -0400
++++ linux-2.6.38.noarch/fs/nfsd/nfsd.h	2011-03-26 07:57:44.296821123 -0400
+@@ -287,11 +287,22 @@ extern time_t nfsd4_grace;
+ #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
+ 	NFSD4_SUPPORTED_ATTRS_WORD0
+ 
++#if defined(CONFIG_PNFSD)
++#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
++	(NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES)
++#else /* CONFIG_PNFSD */
+ #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
+ 	NFSD4_SUPPORTED_ATTRS_WORD1
++#endif /* CONFIG_PNFSD */
+ 
++#if defined(CONFIG_PNFSD)
++#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
++	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \
++	 FATTR4_WORD2_LAYOUT_BLKSIZE)
++#else /* CONFIG_PNFSD */
+ #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
+ 	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
++#endif /* CONFIG_PNFSD */
+ 
+ static inline u32 nfsd_suppattrs0(u32 minorversion)
+ {
+diff -up linux-2.6.38.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.38.noarch/fs/nfsd/nfsfh.c
+--- linux-2.6.38.noarch/fs/nfsd/nfsfh.c.orig	2011-03-14 21:20:32.000000000 -0400
++++ linux-2.6.38.noarch/fs/nfsd/nfsfh.c	2011-03-26 07:57:44.297821114 -0400
+@@ -10,6 +10,7 @@
+ #include
+ 
+ #include
++#include
+ #include "nfsd.h"
+ #include "vfs.h"
+ #include "auth.h"
+@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s
+ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+ {
+ 	struct knfsd_fh	*fh = &fhp->fh_handle;
++	int fsid_type;	/* fh_fsid_type with the pNFS DS marking removed */
+ 	struct fid *fid = NULL, sfid;
+ 	struct svc_export *exp;
+ 	struct dentry *dentry;
+@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct
+ 			return error;
+ 		if (fh->fh_auth_type != 0)
+ 			return error;
+-		len = key_len(fh->fh_fsid_type) / 4;
++		fsid_type = pnfs_fh_fsid_type(fh);
++		len = key_len(fsid_type) / 4;
+ 		if (len == 0)
+ 			return error;
+ 		if  (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
+@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct
+ 		data_left -= len;
+ 		if (data_left < 0)
+ 			return error;
+-		exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth);
++		exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth);
+ 		fid = (struct fid *)(fh->fh_auth + len);
+ 	} else {
+ 		__u32 tfh[2];
+diff -up linux-2.6.38.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.38.noarch/fs/nfsd/nfsfh.h
+--- linux-2.6.38.noarch/fs/nfsd/nfsfh.h.orig	2011-03-14 21:20:32.000000000 -0400
++++ linux-2.6.38.noarch/fs/nfsd/nfsfh.h	2011-03-26 07:57:44.298821106 -0400
+@@ -14,6 +14,7 @@ enum nfsd_fsid {
+ 	FSID_UUID8,
+ 	FSID_UUID16,
+ 	FSID_UUID16_INUM,
++	FSID_MAX	/* values >= FSID_MAX mark a DS filehandle, see pnfs_fh_is_ds() */
+ };
+ 
+ enum fsid_source {
+@@ -203,4 +204,42 @@ fh_unlock(struct svc_fh *fhp)
+ 	}
+ }
+ 
++#if defined(CONFIG_PNFSD)
++
++/*
++ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied
++ * to a DS by LAYOUTGET.  nfs4_preprocess_stateid_op() uses this to decide how
++ * to handle a given stateid.
++ */
++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh)
++{
++	return fh->fh_fsid_type >= FSID_MAX;
++}
++
++static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh)
++{
++	BUG_ON(fh->fh_version != 1);
++	BUG_ON(pnfs_fh_is_ds(fh));	/* must not be marked twice */
++	fh->fh_fsid_type += FSID_MAX;
++}
++
++#else  /* CONFIG_PNFSD */
++
++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh)
++{
++	return 0;
++}
++
++#endif /* CONFIG_PNFSD */
++
++/* allows fh_verify() to check the real fsid_type (i.e., not overloaded).
*/ ++static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh) ++{ ++ int fsid_type = fh->fh_fsid_type; ++ ++ if (pnfs_fh_is_ds(fh)) ++ return fsid_type - FSID_MAX; ++ return fsid_type; ++} ++ + #endif /* _LINUX_NFSD_FH_INT_H */ +diff -up linux-2.6.38.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.38.noarch/fs/nfsd/nfssvc.c +--- linux-2.6.38.noarch/fs/nfsd/nfssvc.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/nfssvc.c 2011-03-26 07:57:44.298821106 -0400 +@@ -116,7 +116,7 @@ struct svc_program nfsd_program = { + + }; + +-u32 nfsd_supported_minorversion; ++u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION; + + int nfsd_vers(int vers, enum vers_op change) + { +diff -up linux-2.6.38.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.38.noarch/fs/nfsd/pnfsd.h +--- linux-2.6.38.noarch/fs/nfsd/pnfsd.h.orig 2011-03-26 07:57:44.299821098 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/pnfsd.h 2011-03-26 07:57:44.299821098 -0400 +@@ -0,0 +1,146 @@ ++/* ++ * Copyright (c) 2005 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ */
++
++#ifndef LINUX_NFSD_PNFSD_H
++#define LINUX_NFSD_PNFSD_H
++
++#include
++#include
++
++#include "state.h"
++#include "xdr4.h"
++
++/* outstanding layout stateid */
++struct nfs4_layout_state {
++	struct list_head	ls_perfile;
++	struct list_head	ls_layouts; /* list of nfs4_layouts */
++	struct kref		ls_ref;
++	struct nfs4_client	*ls_client;
++	struct nfs4_file	*ls_file;
++	stateid_t		ls_stateid;
++};
++
++/* outstanding layout */
++struct nfs4_layout {
++	struct list_head		lo_perfile;	/* hash by f_id */
++	struct list_head		lo_perclnt;	/* hash by clientid */
++	struct list_head		lo_perstate;
++	struct nfs4_file		*lo_file;	/* backpointer */
++	struct nfs4_client		*lo_client;
++	struct nfs4_layout_state	*lo_state;
++	struct nfsd4_layout_seg 	lo_seg;
++};
++
++struct pnfs_inval_state {
++	struct knfsd_fh		mdsfh; /* needed only by invalidate all */
++	stateid_t		stid;
++	clientid_t		clid;
++	u32			status;
++};
++
++/* pNFS Data Server state */
++#define DS_STATEID_VALID   0
++#define DS_STATEID_ERROR   1
++#define DS_STATEID_NEW     2
++
++struct pnfs_ds_stateid {
++	struct list_head	ds_hash;        /* ds_stateid hash entry */
++	struct list_head	ds_perclid;     /* per client hash entry */
++	stateid_t		ds_stid;
++	struct knfsd_fh		ds_fh;
++	unsigned long		ds_access;
++	u32			ds_status;      /* from MDS */
++	u32			ds_verifier[2]; /* from MDS */
++	wait_queue_head_t	ds_waitq;
++	unsigned long		ds_flags;
++	struct kref		ds_ref;
++	clientid_t		ds_mdsclid;
++};
++
++struct pnfs_ds_clientid {
++	struct list_head	dc_hash;        /* mds_clid_hashtbl entry */
++	struct list_head	dc_stateid;     /* ds_stateid head */
++	struct list_head	dc_permdsid;    /* per mdsid hash entry */
++	clientid_t		dc_mdsclid;
++	struct kref		dc_ref;
++	uint32_t		dc_mdsid;
++};
++
++struct pnfs_mds_id {
++	struct list_head	di_hash;        /* mds_nodeid list entry */
++	struct list_head	di_mdsclid;     /* mds_clientid head */
++	uint32_t		di_mdsid;
++	time_t			di_mdsboot;     /* mds boot time */
++	struct kref		di_ref;
++};
++
++/* notify device request (from exported filesystem) */
++struct nfs4_notify_device {
++	struct nfsd4_pnfs_cb_dev_list  *nd_list;
++	struct nfs4_client	       *nd_client;
++	struct list_head	        nd_perclnt;
++
++	/* nfsd internal */
++	struct nfsd4_callback		nd_recall;
++};
++
++u64 find_create_sbid(struct super_block *);
++struct super_block *find_sbid_id(u64);
++__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *);
++int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *,
++					struct nfsd4_pnfs_layoutreturn *);
++int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *);
++int nfs4_pnfs_cb_change_state(struct pnfs_get_state *);
++void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *);
++int put_layoutrecall(struct nfs4_layoutrecall *);
++void nomatching_layout(struct nfs4_layoutrecall *);
++void *layoutrecall_done(struct nfs4_layoutrecall *);
++void nfsd4_cb_layout(struct nfs4_layoutrecall *);
++int nfsd_layout_recall_cb(struct super_block *, struct inode *,
++			  struct nfsd4_pnfs_cb_layout *);
++int nfsd_device_notify_cb(struct super_block *,
++			  struct nfsd4_pnfs_cb_dev_list *);
++void nfsd4_cb_notify_device(struct nfs4_notify_device *);
++void pnfs_set_device_notify(clientid_t *, unsigned int types);
++void pnfs_clear_device_notify(struct nfs4_client *);
++/* Local-export interface; the definitions live in fs/nfsd/pnfsd_lexp.c. */
++#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
++extern struct sockaddr pnfsd_lexp_addr;
++extern size_t pnfs_lexp_addr_len;
++
++extern void pnfsd_lexp_init(struct inode *);	/* installs the lexp ops on the inode's sb */
++extern bool is_inode_pnfsd_lexp(struct inode *);	/* true if the inode's sb uses the lexp ops */
++extern int pnfsd_lexp_recall_layout(struct inode *);	/* recall and wait for layout return */
++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
++
++#endif /* LINUX_NFSD_PNFSD_H */
+diff -up linux-2.6.38.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.38.noarch/fs/nfsd/pnfsd_lexp.c
+--- linux-2.6.38.noarch/fs/nfsd/pnfsd_lexp.c.orig	2011-03-26 07:57:44.300821090 -0400
++++ linux-2.6.38.noarch/fs/nfsd/pnfsd_lexp.c	2011-03-26 07:57:44.300821090 -0400
+@@ -0,0 +1,296 @@
++/*
++ * linux/fs/nfsd/pnfsd_lexp.c
++ *
++ * pNFS export of local filesystems.
++ *
++ * Export local file systems over the files layout type.
++ * The MDS (metadata server) functions also as a single DS (data server).
++ * This is mostly useful for development and debugging purposes.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * Copyright (C) 2008 Benny Halevy,
++ *
++ * Initial implementation was based on the pnfs-gfs2 patches done
++ * by David M.
Richter
++ */
++
++#include
++#include
++#include
++#include
++
++#include "pnfsd.h"
++
++#define NFSDDBG_FACILITY NFSDDBG_PNFS
++
++struct sockaddr pnfsd_lexp_addr;
++size_t pnfs_lexp_addr_len;
++
++/* Woken by pnfsd_lexp_layout_return(); slept on by pnfsd_lexp_recall_layout(). */
++static DECLARE_WAIT_QUEUE_HEAD(lo_recall_wq);
++
++static int
++pnfsd_lexp_layout_type(struct super_block *sb)
++{
++	int ret = LAYOUT_NFSV4_1_FILES;
++	dprintk("<-- %s: return %d\n", __func__, ret);
++	return ret;
++}
++
++static int
++pnfsd_lexp_get_device_iter(struct super_block *sb,
++			   u32 layout_type,
++			   struct nfsd4_pnfs_dev_iter_res *res)
++{
++	dprintk("--> %s: sb=%p\n", __func__, sb);
++
++	BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES);
++
++	res->gd_eof = 1;
++	if (res->gd_cookie)	/* the single device was already returned */
++		return -ENOENT;
++	res->gd_cookie = 1;
++	res->gd_verf = 1;
++	res->gd_devid = 1;
++
++	dprintk("<-- %s: return 0\n", __func__);
++	return 0;
++}
++
++static int
++pnfsd_lexp_get_device_info(struct super_block *sb,
++			   struct exp_xdr_stream *xdr,
++			   u32 layout_type,
++			   const struct nfsd4_pnfs_deviceid *devid)
++{
++	int err;
++	struct pnfs_filelayout_device fdev;
++	struct pnfs_filelayout_multipath fl_devices[1];
++	u32 fl_stripe_indices[1] = { 0 };
++	struct pnfs_filelayout_devaddr daddr;
++	/* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */
++	char daddr_buf[8*4 + 2*3 + 10];
++
++	dprintk("--> %s: sb=%p\n", __func__, sb);
++
++	BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES);
++
++	memset(&fdev, '\0', sizeof(fdev));
++
++	if (devid->devid != 1) {
++		printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 "
++			"(got: 0x%llx)\n", __func__, devid->devid);
++		err = -EINVAL;
++		goto out;
++	}
++
++	/* exactly one device and one stripe index: the local server itself */
++	fdev.fl_device_length = 1;
++	fdev.fl_device_list = fl_devices;
++
++	fdev.fl_stripeindices_length = fdev.fl_device_length;
++	fdev.fl_stripeindices_list = fl_stripe_indices;
++
++	daddr.r_addr.data = daddr_buf;
++	daddr.r_addr.len = sizeof(daddr_buf);
++	err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr);
++	if (err < 0)
++		goto out;
++	daddr.r_addr.len = err;	/* non-negative result is used as the length */
++	switch (pnfsd_lexp_addr.sa_family) {
++	case AF_INET:
++		daddr.r_netid.data = "tcp";
++		daddr.r_netid.len = 3;
++		break;
++	case AF_INET6:
++		daddr.r_netid.data = "tcp6";
++		daddr.r_netid.len = 4;
++		break;
++	default:
++		BUG();
++	}
++	fdev.fl_device_list[0].fl_multipath_length = 1;
++	fdev.fl_device_list[0].fl_multipath_list = &daddr;
++
++	/* have nfsd encode the device info */
++	err = filelayout_encode_devinfo(xdr, &fdev);
++out:
++	dprintk("<-- %s: return %d\n", __func__, err);
++	return err;
++}
++
++static int get_stripe_unit(int blocksize)
++{
++	if (blocksize < NFSSVC_MAXBLKSIZE)	/* largest multiple of blocksize that fits */
++		blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++	dprintk("%s: return %d\n", __func__, blocksize);
++	return blocksize;
++}
++
++static enum nfsstat4
++pnfsd_lexp_layout_get(struct inode *inode,
++		      struct exp_xdr_stream *xdr,
++		      const struct nfsd4_pnfs_layoutget_arg *arg,
++		      struct nfsd4_pnfs_layoutget_res *res)
++{
++	enum nfsstat4 rc = NFS4_OK;
++	struct pnfs_filelayout_layout *layout = NULL;
++	struct knfsd_fh *fhp = NULL;
++
++	dprintk("--> %s: inode=%p\n", __func__, inode);
++
++	res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	res->lg_seg.offset = 0;
++	res->lg_seg.length = NFS4_MAX_UINT64;
++
++	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
++	if (layout == NULL) {
++		rc = -ENOMEM;	/* NOTE(review): errno stored in an nfsstat4; confirm the caller maps it */
++		goto error;
++	}
++
++	/* Set file layout response args */
++	layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
++	layout->lg_stripe_type = STRIPE_SPARSE;
++	layout->lg_commit_through_mds = true;
++	layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
++	layout->lg_fh_length = 1;
++	layout->device_id.sbid = arg->lg_sbid;
++	layout->device_id.devid = 1;				/*FSFTEMP*/
++	layout->lg_first_stripe_index = 0;			/*FSFTEMP*/
++	layout->lg_pattern_offset = 0;
++
++	fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
++	if (fhp == NULL) {
++		rc = -ENOMEM;
++		goto error;
++	}
++
++	memcpy(fhp, arg->lg_fh, sizeof(*fhp));
++	pnfs_fh_mark_ds(fhp);	/* the copy handed to the client is a DS filehandle */
++	layout->lg_fh_list = fhp;
++
++	/* Call nfsd to encode layout */
++	rc = filelayout_encode_layout(xdr, layout);
++exit:
++	kfree(layout);
++	kfree(fhp);
++	dprintk("<-- %s: return %d\n", __func__, rc);
++	return rc;
++
++error:
++	res->lg_seg.length = 0;
++	goto exit;
++}
++
++static int
++pnfsd_lexp_layout_commit(struct inode *inode,
++			 const struct nfsd4_pnfs_layoutcommit_arg *args,
++			 struct nfsd4_pnfs_layoutcommit_res *res)
++{
++	dprintk("%s: (unimplemented)\n", __func__);
++
++	return 0;
++}
++
++static int
++pnfsd_lexp_layout_return(struct inode *inode,
++			 const struct nfsd4_pnfs_layoutreturn_arg *args)
++{
++	wake_up_all(&lo_recall_wq);	/* let pnfsd_lexp_recall_layout() re-check */
++	return 0;
++}
++
++static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh,
++				struct pnfs_get_state *p)
++{
++	return 0;	/* just use the current stateid */
++}
++
++static struct pnfs_export_operations pnfsd_lexp_ops = {
++	.layout_type = pnfsd_lexp_layout_type,
++	.get_device_info = pnfsd_lexp_get_device_info,
++	.get_device_iter = pnfsd_lexp_get_device_iter,
++	.layout_get = pnfsd_lexp_layout_get,
++	.layout_commit = pnfsd_lexp_layout_commit,
++	.layout_return = pnfsd_lexp_layout_return,
++	.get_state = pnfsd_lexp_get_state,
++};
++
++/*
++ * Install the local-export pNFS operations on the inode's superblock.
++ * lo_recall_wq is initialized statically, so it is valid before the ops
++ * become reachable and is never re-initialized under a sleeper.
++ */
++void
++pnfsd_lexp_init(struct inode *inode)
++{
++	dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops);
++	inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops;
++}
++
++bool
++is_inode_pnfsd_lexp(struct inode *inode)
++{
++	return inode->i_sb->s_pnfs_op == &pnfsd_lexp_ops;
++}
++
++static bool
++has_layout(struct nfs4_file *fp)
++{
++	return !list_empty(&fp->fi_layouts);
++}
++
++/*
++ * recalls the layout if needed and waits synchronously for its return
++ */
++int
++pnfsd_lexp_recall_layout(struct inode *inode)
++{
++	struct nfs4_file *fp;
++	struct nfsd4_pnfs_cb_layout cbl;
++	int status = 0;
++
++	dprintk("%s: begin\n", __func__);
++	fp = find_file(inode);
++	BUG_ON(!fp);
++
++	if (!has_layout(fp))
++		goto out;
++
++	memset(&cbl, 0, sizeof(cbl));
++	cbl.cbl_recall_type = RETURN_FILE;
++	cbl.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	/* for now, always recall the whole layout */
++	cbl.cbl_seg.iomode = IOMODE_ANY;
++	cbl.cbl_seg.offset = 0;
++	cbl.cbl_seg.length = NFS4_MAX_UINT64;
++
++	while (has_layout(fp)) {
++		dprintk("%s: recalling layout\n", __func__);
++		status = nfsd_layout_recall_cb(inode->i_sb, inode, &cbl);
++
++		switch (status) {
++		case 0:
++		case -EAGAIN:
++			break;
++		case -ENOENT:	/* no matching layout */
++			status = 0;
++			goto out;
++		default:
++			goto out;
++		}
++
++		dprintk("%s: waiting status=%d\n", __func__, status);
++		status = wait_event_interruptible(lo_recall_wq, !has_layout(fp));
++		if (status)	/* interrupted by a signal */
++			break;
++	}
++out:
++	put_nfs4_file(fp);
++	dprintk("%s: status=%d\n", __func__, status);
++	return status;
++}
+diff -up linux-2.6.38.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.38.noarch/fs/nfsd/spnfs_com.c
+--- linux-2.6.38.noarch/fs/nfsd/spnfs_com.c.orig	2011-03-26 07:57:44.301821082 -0400
++++ linux-2.6.38.noarch/fs/nfsd/spnfs_com.c	2011-03-26 07:57:44.301821082 -0400
+@@ -0,0 +1,535 @@
++/*
++ * fs/nfsd/spnfs_com.c
++ *
++ * Communication layer between spNFS kernel and userspace
++ * Based heavily on idmap.c
++ *
++ */
++
++/*
++ *  Copyright (c) 2002 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Marius Aamodt Eriksen
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3.
Neither the name of the University nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++
++#define	NFSDDBG_FACILITY		NFSDDBG_PROC
++
++static ssize_t   spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *,
++		     char __user *, size_t);
++static ssize_t   spnfs_pipe_downcall(struct file *, const char __user *,
++		     size_t);
++static void      spnfs_pipe_destroy_msg(struct rpc_pipe_msg *);
++
++static struct rpc_pipe_ops spnfs_upcall_ops = {	/* handed to rpc_mkpipe() in nfsd_spnfs_new() */
++	.upcall		= spnfs_pipe_upcall,
++	.downcall	= spnfs_pipe_downcall,
++	.destroy_msg	= spnfs_pipe_destroy_msg,
++};
++
++/* evil global variable: set by nfsd_spnfs_new(), cleared by nfsd_spnfs_delete() */
++struct spnfs *global_spnfs;
++struct spnfs_config *spnfs_config;	/* set by config_write() to its static buffer */
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++int spnfs_use_layoutsegments;		/* written through the layoutseg proc file */
++uint64_t layoutsegment_size;		/* written through the layoutsegsize proc file */
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++
++/*
++ * Used by spnfs_enabled()
++ * Tracks if the subsystem has been initialized at some point.  It doesn't
++ * matter if it's not currently initialized.
++ */
++static int spnfs_enabled_at_some_point;
++
++/* call this to start the ball rolling */
++/* code it like we're going to avoid the global variable in the future */
++int
++nfsd_spnfs_new(void)
++{
++	struct spnfs *spnfs = NULL;
++	struct path path;
++	struct nameidata nd;
++	int rc;
++
++	if (global_spnfs != NULL)
++		return -EEXIST;
++
++	path.mnt = rpc_get_mount();
++	if (IS_ERR(path.mnt))
++		return PTR_ERR(path.mnt);
++
++	/* FIXME: do not abuse rpc_pipefs/nfs */
++	rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
++	if (rc)
++		goto err;
++
++	spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL);
++	if (spnfs == NULL) {
++		rc = -ENOMEM;
++		goto err_path;
++	}
++
++	/* set up the locks before the pipe becomes visible to userspace */
++	mutex_init(&spnfs->spnfs_lock);
++	mutex_init(&spnfs->spnfs_plock);
++	init_waitqueue_head(&spnfs->spnfs_wq);
++
++	spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs,
++					 &spnfs_upcall_ops, 0);
++	if (IS_ERR(spnfs->spnfs_dentry)) {
++		rc = -EPIPE;
++		goto err_path;
++	}
++	path_put(&nd.path);	/* drop the reference taken by vfs_path_lookup() */
++
++	global_spnfs = spnfs;
++	spnfs_enabled_at_some_point = 1;
++	return 0;
++err_path:
++	path_put(&nd.path);
++err:
++	rpc_put_mount();
++	kfree(spnfs);
++	return rc;
++}
++
++/* again, code it like we're going to remove the global variable */
++void
++nfsd_spnfs_delete(void)
++{
++	struct spnfs *spnfs = global_spnfs;
++
++	if (!spnfs)
++		return;
++	rpc_unlink(spnfs->spnfs_dentry);
++	rpc_put_mount();
++	global_spnfs = NULL;
++	kfree(spnfs);
++}
++
++/* RPC pipefs upcall/downcall routines */
++/* looks like this code is invoked by the rpc_pipe code */
++/* to handle upcalls on things we've queued elsewhere */
++/* See nfs_idmap_id for an example of enqueueing */
++static ssize_t
++spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
++		  char __user *dst, size_t buflen)
++{
++	char *data = (char *)msg->data + msg->copied;
++	ssize_t mlen = msg->len - msg->copied;
++	ssize_t left;
++
++	if (mlen > buflen)
++		mlen = buflen;
++
++	/* copy_to_user() returns the bytes NOT copied; it is never negative */
++	left = copy_to_user(dst, data, mlen);
++	if (left == mlen) {
++		msg->errno = -EFAULT;
++		return -EFAULT;
++	}
++	mlen -= left;
++	msg->copied += mlen;
++	msg->errno = 0;
++	return mlen;
++}
++
++static ssize_t
++spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
++{
++	struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
++	struct spnfs *spnfs = (struct spnfs *)rpci->private;
++	struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im;
++	int ret;
++
++	if (mlen != sizeof(struct spnfs_msg))
++		return -ENOSPC;
++
++	im_in = memdup_user(src, mlen);	/* frees its buffer on a failed copy */
++	if (IS_ERR(im_in))
++		return PTR_ERR(im_in);
++
++	mutex_lock(&spnfs->spnfs_plock);
++
++	ret = mlen;
++	im->im_status = im_in->im_status;
++	/* If we got an error, terminate now, and wake up pending upcalls */
++	if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) {
++		wake_up(&spnfs->spnfs_wq);
++		goto out;
++	}
++
++	ret = -EINVAL;
++	/* Did we match the current upcall? */
++	/* DMXXX: do not understand the comment above, from original code */
++	/* DMXXX: when do we _not_ match the current upcall? */
++	/* DMXXX: anyway, let's do a simplistic check */
++	if (im_in->im_type == im->im_type) {
++		/* copy the response into the spnfs struct */
++		memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res));
++		ret = mlen;
++	} else
++		dprintk("spnfs: downcall type != upcall type\n");
++
++
++	wake_up(&spnfs->spnfs_wq);
++/* DMXXX handle rval processing */
++out:
++	mutex_unlock(&spnfs->spnfs_plock);
++	kfree(im_in);
++	return ret;
++}
++
++static void
++spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg)
++{
++	struct spnfs_msg *im = msg->data;
++	struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im);
++
++	if (msg->errno >= 0)
++		return;
++	mutex_lock(&spnfs->spnfs_plock);
++	im->im_status = SPNFS_STATUS_FAIL;  /* DMXXX */
++	wake_up(&spnfs->spnfs_wq);
++	mutex_unlock(&spnfs->spnfs_plock);
++}
++
++/* generic upcall.
called by functions in spnfs_ops.c; returns 0 and fills *res on success, -EIO otherwise */
++int
++spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg,
++		union spnfs_msg_res *res)
++{
++	struct rpc_pipe_msg msg;
++	struct spnfs_msg *im;
++	DECLARE_WAITQUEUE(wq, current);
++	int ret = -EIO;
++	int rval;
++
++	im = &spnfs->spnfs_im;	/* the single message slot shared with the downcall */
++
++	mutex_lock(&spnfs->spnfs_lock);	/* held for the whole upcall: one in flight at a time */
++	mutex_lock(&spnfs->spnfs_plock);
++
++	memset(im, 0, sizeof(*im));
++	memcpy(im, upmsg, sizeof(*upmsg));
++
++	memset(&msg, 0, sizeof(msg));
++	msg.data = im;
++	msg.len = sizeof(*im);
++
++	add_wait_queue(&spnfs->spnfs_wq, &wq);	/* queue ourselves before the message can be answered */
++	rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg);
++	if (rval < 0) {
++		remove_wait_queue(&spnfs->spnfs_wq, &wq);
++		goto out;
++	}
++
++	set_current_state(TASK_UNINTERRUPTIBLE);
++	mutex_unlock(&spnfs->spnfs_plock);	/* the downcall and destroy_msg paths take plock */
++	schedule();
++	current->state = TASK_RUNNING;
++	remove_wait_queue(&spnfs->spnfs_wq, &wq);
++	mutex_lock(&spnfs->spnfs_plock);
++
++	if (im->im_status & SPNFS_STATUS_SUCCESS) {
++		/* copy our result from the upcall */
++		memcpy(res, &im->im_res, sizeof(*res));
++		ret = 0;
++	}
++
++out:
++	memset(im, 0, sizeof(*im));	/* clear the shared slot for the next upcall */
++	mutex_unlock(&spnfs->spnfs_plock);
++	mutex_unlock(&spnfs->spnfs_lock);
++	return(ret);
++}
++
++/*
++ * This is used to determine if the spnfsd daemon has been started at
++ * least once since the system came up.  This is used by the export
++ * mechanism to decide if spnfs is in use.
++ *
++ * Returns non-zero if the spnfsd has initialized the communication pipe
++ * at least once.
++ */
++int spnfs_enabled(void)
++{
++	return spnfs_enabled_at_some_point;
++}
++
++#ifdef CONFIG_PROC_FS
++
++/*
++ * procfs virtual files for user/kernel space communication:
++ *
++ * ctl - currently just an on/off switch...can be expanded
++ * getfh - fd to fh conversion
++ * recall - recall a layout from the command line, for example:
++ *		echo "PATH [OFFSET [LENGTH]]" > /proc/fs/spnfs/recall
++ * config - configuration info, e.g., stripe size, num ds, etc.
++ */
++
++/*-------------- start ctl -------------------------*/
++static ssize_t ctl_write(struct file *file, const char __user *buf,
++			 size_t count, loff_t *offset)
++{
++	int cmd, rc;
++
++	if (count < sizeof(cmd))	/* never read past what the caller supplied */
++		return -EINVAL;
++	if (copy_from_user(&cmd, buf, sizeof(cmd)))
++		return -EFAULT;
++	if (!cmd) {		/* zero switches spnfs off */
++		nfsd_spnfs_delete();
++		return count;
++	}
++	rc = nfsd_spnfs_new();	/* non-zero switches it on */
++	return rc ? rc : count;
++}
++
++static const struct file_operations ctl_ops = {
++	.write		= ctl_write,
++};
++/*-------------- end ctl ---------------------------*/
++
++/*-------------- start config -------------------------*/
++static ssize_t config_write(struct file *file, const char __user *buf,
++			    size_t count, loff_t *offset)
++{
++	static struct spnfs_config cfg;
++
++	if (count > sizeof(cfg))	/* copying more would overrun the static buffer */
++		return -EINVAL;
++	if (copy_from_user(&cfg, buf, count))
++		return -EFAULT;
++
++	spnfs_config = &cfg;
++	return 0;	/* NOTE(review): reports 0 bytes written; confirm what spnfsd expects */
++}
++
++static const struct file_operations config_ops = {
++	.write		= config_write,
++};
++/*-------------- end config ---------------------------*/
++
++/*-------------- start getfh -----------------------*/
++static int getfh_open(struct inode *inode, struct file *file)
++{
++	/* zeroed, so a read before any write cannot leak heap contents */
++	file->private_data = kzalloc(sizeof(struct nfs_fh), GFP_KERNEL);
++	return file->private_data ? 0 : -ENOMEM;
++}
++
++static ssize_t getfh_read(struct file *file, char __user *buf, size_t count,
++			  loff_t *offset)
++{
++	if (count < sizeof(struct nfs_fh))	/* user buffer too small for the handle */
++		return -EINVAL;
++	if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh)))
++		return -EFAULT;
++	return count;
++}
++
++static ssize_t getfh_write(struct file *file, const char __user *buf,
++			   size_t count, loff_t *offset)
++{
++	int fd;
++
++	if (count < sizeof(fd))
++		return -EINVAL;
++	if (copy_from_user(&fd, buf, sizeof(fd)))
++		return -EFAULT;
++	if (spnfs_getfh(fd, file->private_data) != 0)
++		return -EIO;
++
++	return count;
++}
++
++static int getfh_release(struct inode *inode, struct file *file)
++{
++	kfree(file->private_data);
++	return 0;
++}
++
++static const struct file_operations getfh_ops = {
++	.open		= getfh_open,
++	.read		= getfh_read,
++	.write		= getfh_write,
++	.release	= getfh_release,
++};
++/*-------------- end getfh ------------------------*/
++
++/*-------------- start recall layout --------------*/
++static ssize_t recall_write(struct file *file, const char __user *buf,
++			    size_t count, loff_t *offset)
++{
++	char input[128];
++	char *path, *str, *p;
++	int rc;
++	u64 off = 0, len = 0;
++
++	if (count > 128)
++		return -EINVAL;
++
++	if (copy_from_user(input, buf, count))
++		return -EFAULT;
++
++	/* assumes newline-terminated path */
++	p = memchr(input, '\n', count);
++	if (p == NULL)
++		return -EINVAL;
++	*p = '\0';
++
++	/*
++	 * Scan for path and, optionally, an offset and length
++	 * of a layout segment to be recalled; if there are two
++	 * fields, they're assumed to be path and offset.
++	 */
++	p = input;
++	path = strsep(&p, " ");
++	if (path == NULL)
++		return -EINVAL;
++
++	str = strsep(&p, " ");
++	if (str != NULL) {
++		rc = strict_strtoull(str, 10, &off);
++		if (rc != 0)
++			return -EINVAL;
++
++		str = strsep(&p, " ");
++		if (str != NULL) {
++			rc = strict_strtoull(str, 10, &len);
++			if (rc != 0)
++				return -EINVAL;
++		}
++	}
++
++	rc = spnfs_test_layoutrecall(path, off, len);
++	if (rc != 0)
++		return rc;
++
++	return count;
++}
++
++static const struct file_operations recall_ops = {
++	.write		= recall_write,
++};
++/*-------------- end recall layout --------------*/
++
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++/*-------------- start layoutseg -------------------------*/
++static ssize_t layoutseg_write(struct file *file, const char __user *buf,
++			       size_t count, loff_t *offset)
++{
++	char cmd;
++
++	if (count == 0)		/* nothing to read the switch from */
++		return -EINVAL;
++	if (copy_from_user(&cmd, buf, 1))
++		return -EFAULT;
++	spnfs_use_layoutsegments = (cmd != '0');	/* '0' disables, anything else enables */
++
++	return count;
++}
++
++static const struct file_operations layoutseg_ops = {
++	.write		= layoutseg_write,
++};
++/*-------------- end layoutseg ---------------------------*/
++
++/*-------------- start layoutsegsize
-------------------------*/ ++static ssize_t layoutsegsize_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[50]; ++ ++ if (copy_from_user(cmd, buf, 49)) ++ return -EFAULT; ++ layoutsegment_size = simple_strtoull(cmd, NULL, 10); ++ ++ return count; ++} ++ ++static const struct file_operations layoutsegsize_ops = { ++ .write = layoutsegsize_write, ++}; ++/*-------------- end layoutsegsize ---------------------------*/ ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++int ++spnfs_init_proc(void) ++{ ++ struct proc_dir_entry *entry; ++ ++ entry = proc_mkdir("fs/spnfs", NULL); ++ if (!entry) ++ return -ENOMEM; ++ ++ entry = create_proc_entry("fs/spnfs/ctl", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &ctl_ops; ++ ++ entry = create_proc_entry("fs/spnfs/config", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &config_ops; ++ ++ entry = create_proc_entry("fs/spnfs/getfh", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &getfh_ops; ++ ++ entry = create_proc_entry("fs/spnfs/recall", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &recall_ops; ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++ entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutseg_ops; ++ ++ entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutsegsize_ops; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++ return 0; ++} ++#endif /* CONFIG_PROC_FS */ +diff -up linux-2.6.38.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.38.noarch/fs/nfsd/spnfs_ops.c +--- linux-2.6.38.noarch/fs/nfsd/spnfs_ops.c.orig 2011-03-26 07:57:44.302821074 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/spnfs_ops.c 2011-03-26 07:57:44.302821074 -0400 +@@ -0,0 +1,878 @@ ++/* ++ * fs/nfsd/spnfs_ops.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * ++ */ 
++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++
++******************************************************************************/
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "pnfsd.h"
++
++/* comment out CONFIG_SPNFS_TEST for non-test behaviour */
++/* #define CONFIG_SPNFS_TEST 1 */
++
++#define NFSDDBG_FACILITY NFSDDBG_PNFS
++
++/*
++ * The functions that are called from elsewhere in the kernel
++ * to perform tasks in userspace
++ *
++ */
++
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++extern int spnfs_use_layoutsegments;
++extern uint64_t layoutsegment_size;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++extern struct spnfs *global_spnfs;
++
++/* spNFS only hands out file layouts, whatever the superblock. */
++int
++spnfs_layout_type(struct super_block *sb)
++{
++	return LAYOUT_NFSV4_1_FILES;
++}
++
++/*
++ * Ask the userspace daemon (via spnfs_upcall) for the layout of @inode
++ * and encode it into @xdr as a file layout.
++ *
++ * @lg_arg supplies the superblock id used for the device id; @lg_res
++ * receives return_on_close and the granted segment length.
++ *
++ * Returns the result of filelayout_encode_layout() on success, otherwise:
++ *	NFS4ERR_LAYOUTTRYLATER    - allocation failure, or the daemon
++ *				    reported -ENOMEM/-EAGAIN/-EINTR
++ *	NFS4ERR_TOOSMALL	  - daemon reported -E2BIG/-ETOOSMALL
++ *	NFS4ERR_BADLAYOUT	  - daemon reported -ENOENT
++ *	NFS4ERR_LAYOUTUNAVAILABLE - upcall failed, any other daemon error,
++ *				    or a reply carrying an oversized filehandle
++ */
++enum nfsstat4
++spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr,
++		const struct nfsd4_pnfs_layoutget_arg *lg_arg,
++		struct nfsd4_pnfs_layoutget_res *lg_res)
++{
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	struct pnfs_filelayout_layout *flp = NULL;
++	unsigned int fh_len;
++	int status, i;
++	enum nfsstat4 nfserr;
++
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++
++	im->im_type = SPNFS_TYPE_LAYOUTGET;
++	im->im_args.layoutget_args.inode = inode->i_ino;
++	im->im_args.layoutget_args.generation = inode->i_generation;
++
++	/* call function to queue the msg for upcall */
++	if (spnfs_upcall(spnfs, im, res) != 0) {
++		dprintk("failed spnfs upcall: layoutget\n");
++		nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++		goto layoutget_cleanup;
++	}
++	status = res->layoutget_res.status;
++	if (status != 0) {
++		/* FIXME? until user mode is fixed, translate system error */
++		switch (status) {
++		case -E2BIG:
++		case -ETOOSMALL:
++			nfserr = NFS4ERR_TOOSMALL;
++			break;
++		case -ENOMEM:
++		case -EAGAIN:
++		case -EINTR:
++			nfserr = NFS4ERR_LAYOUTTRYLATER;
++			break;
++		case -ENOENT:
++			nfserr = NFS4ERR_BADLAYOUT;
++			break;
++		default:
++			nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++		}
++		dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n",
++			status, nfserr);
++		goto layoutget_cleanup;
++	}
++
++	lg_res->lg_return_on_close = 0;
++#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
++	/* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */
++	/* the amount requested by the client. */
++	if (spnfs_use_layoutsegments) {
++		if (layoutsegment_size != 0)
++			lg_res->lg_seg.length = layoutsegment_size;
++	} else
++		lg_res->lg_seg.length = NFS4_MAX_UINT64;
++#else
++	lg_res->lg_seg.length = NFS4_MAX_UINT64;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++
++	flp = kzalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL);
++	if (flp == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	flp->device_id.sbid = lg_arg->lg_sbid;
++	flp->device_id.devid = res->layoutget_res.devid;
++	flp->lg_layout_type = 1; /* XXX */
++	flp->lg_stripe_type = res->layoutget_res.stripe_type;
++	flp->lg_commit_through_mds = 0;
++	flp->lg_stripe_unit = res->layoutget_res.stripe_size;
++	flp->lg_first_stripe_index = 0;
++	flp->lg_pattern_offset = 0;
++	flp->lg_fh_length = res->layoutget_res.stripe_count;
++
++	/*
++	 * stripe_count comes from the daemon; kcalloc() fails cleanly
++	 * instead of under-allocating if count * size would overflow.
++	 *
++	 * NOTE(review): stripe_count is not checked against the capacity
++	 * of layoutget_res.flist, whose declaration is not visible here --
++	 * confirm and bound it if flist is a fixed-size array.
++	 */
++	flp->lg_fh_list = kcalloc(flp->lg_fh_length, sizeof(struct knfsd_fh),
++				  GFP_KERNEL);
++	if (flp->lg_fh_list == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	/*
++	 * FIX: Doing an extra copy here. Should group res.flist's fh_len
++	 * and fh_val into a knfsd_fh structure.
++	 */
++	for (i = 0; i < flp->lg_fh_length; i++) {
++		fh_len = res->layoutget_res.flist[i].fh_len;
++		/* never copy more than the destination handle can hold */
++		if (fh_len > sizeof(flp->lg_fh_list[i].fh_base)) {
++			dprintk("spnfs layout_get: bad fh_len %u\n", fh_len);
++			nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++			goto layoutget_cleanup;
++		}
++		flp->lg_fh_list[i].fh_size = fh_len;
++		memcpy(&flp->lg_fh_list[i].fh_base,
++		       res->layoutget_res.flist[i].fh_val, fh_len);
++	}
++
++	/* encode the layoutget body */
++	nfserr = filelayout_encode_layout(xdr, flp);
++
++layoutget_cleanup:
++	if (flp) {
++		kfree(flp->lg_fh_list);
++		kfree(flp);
++	}
++	kfree(im);
++	kfree(res);
++
++	return nfserr;
++}
++
++/* Placeholder: nothing to do for layoutcommit; always returns 0. */
++int
++spnfs_layoutcommit(void)
++{
++	return 0;
++}
++
++/* Placeholder: nothing to do for layoutreturn; always returns 0. */
++int
++spnfs_layoutreturn(struct inode *inode,
++		   const struct nfsd4_pnfs_layoutreturn_arg *args)
++{
++	return 0;
++}
++
++/*
++ * Issue a layout recall of the given @type.
++ *
++ * Only RETURN_FILE is implemented: it recalls [@offset, @offset + @len)
++ * of @inode for any iomode through nfsd_layout_recall_cb().
++ * RETURN_FSID and RETURN_ALL only log and return 0.
++ *
++ * Returns 0, or -EINVAL for an unknown @type.  The result of
++ * nfsd_layout_recall_cb() is not propagated.
++ */
++int
++spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
++{
++	struct nfsd4_pnfs_cb_layout lr;
++
++	switch (type) {
++	case RETURN_FILE:
++		dprintk("%s: recalling layout for ino = %lu\n",
++			__func__, inode->i_ino);
++		break;
++	case RETURN_FSID:
++		dprintk("%s: recalling layout for fsid x (unimplemented)\n",
++			__func__);
++		return 0;
++	case RETURN_ALL:
++		/* XXX figure out how to get a sb since there's no inode ptr */
++		dprintk("%s: recalling all layouts (unimplemented)\n",
++			__func__);
++		return 0;
++	default:
++		return -EINVAL;
++	}
++
++	/* do not hand stack garbage to the callback in unset fields */
++	memset(&lr, 0, sizeof(lr));
++	lr.cbl_recall_type = type;
++	lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	lr.cbl_seg.clientid = 0;
++	lr.cbl_seg.offset = offset;
++	lr.cbl_seg.length = len;
++	lr.cbl_seg.iomode = IOMODE_ANY;
++	lr.cbl_layoutchanged = 0;
++
++	nfsd_layout_recall_cb(inode->i_sb, inode, &lr);
++
++	return 0;
++}
++
++
++/*
++ * Test entry point behind the "recall" proc file.
++ *
++ * @path "all" requests RETURN_ALL; any other value is looked up and the
++ * resulting inode is recalled with RETURN_FILE.  A @len of 0 means
++ * "to the end" (NFS4_MAX_UINT64).
++ *
++ * Returns -ENOENT if the lookup fails, otherwise the result of
++ * spnfs_layoutrecall().
++ */
++int
++spnfs_test_layoutrecall(char *path, u64 offset, u64 len)
++{
++	struct nameidata nd;
++	struct inode *inode;
++	int type, rc;
++
++	dprintk("%s: path=%s, offset=%llu, len=%llu\n",
++		__func__, path, offset, len);
++
++	if (strcmp(path, "all") == 0) {
++		inode = NULL;
++		type = RETURN_ALL;
++	} else {
++		rc = path_lookup(path, 0, &nd);
++		if (rc != 0)
++			return -ENOENT;
++
++		/*
++		 * XXX todo: add a RETURN_FSID scenario here...maybe if
++		 * inode is a dir...
++		 */
++
++		inode = nd.path.dentry->d_inode;
++		type = RETURN_FILE;
++	}
++
++	if (len == 0)
++		len = NFS4_MAX_UINT64;
++
++	rc = spnfs_layoutrecall(inode, type, offset, len);
++
++	/* nd only holds a path reference when a lookup was done */
++	if (type != RETURN_ALL)
++		path_put(&nd.path);
++	return rc;
++}
++
++/*
++ * Fetch the next device id from the daemon.
++ *
++ * The cookie and verifier in @gd_res are sent with the request.  On a
++ * completed upcall @gd_res is updated: gd_eof is set when the daemon
++ * reports end of list, otherwise devid, cookie and verifier are copied
++ * from the reply and gd_eof is cleared.
++ *
++ * Returns the daemon's status, -ENOMEM on allocation failure or -EIO if
++ * the upcall itself fails.
++ */
++int
++spnfs_getdeviceiter(struct super_block *sb,
++		    u32 layout_type,
++		    struct nfsd4_pnfs_dev_iter_res *gd_res)
++{
++	struct spnfs *spnfs = global_spnfs;   /* XXX keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
++
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto getdeviceiter_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto getdeviceiter_out;
++	}
++
++	im->im_type = SPNFS_TYPE_GETDEVICEITER;
++	im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie;
++	im->im_args.getdeviceiter_args.verf = gd_res->gd_verf;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto getdeviceiter_out;
++	}
++	status = res->getdeviceiter_res.status;
++
++	if (res->getdeviceiter_res.eof)
++		gd_res->gd_eof = 1;
++	else {
++		gd_res->gd_devid = res->getdeviceiter_res.devid;
++		gd_res->gd_cookie = res->getdeviceiter_res.cookie;
++		gd_res->gd_verf = res->getdeviceiter_res.verf;
++		gd_res->gd_eof = 0;
++	}
++
++getdeviceiter_out:
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++#ifdef CONFIG_SPNFS_TEST
++/*
++ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the
++ * 1024 encoded stripe indices.
++ *
++ * Skip the devaddr4 length and encode the indicies count (1024) in the
++ * rq_res.head and set the rq_res.head length.
++ *
++ * Set the rq_res page_len to 4096 (for the 1024 stripe indices).
++ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the
++ * rq_res head to hold the rest of the getdeviceinfo return.
++ *
++ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and
++ * rq_respages[rq_resused] contains the rq_res.pages.
++ */
++static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info,
++				  const struct pnfs_filelayout_device *fdev)
++{
++	struct nfsd4_compoundres *resp = info->resp;
++	struct svc_rqst *rqstp = resp->rqstp;
++	struct xdr_buf *xb = &resp->rqstp->rq_res;
++	__be32 *p;
++
++	p = nfsd4_xdr_reserve_space(resp, 8);
++	p++;	/* Fill in length later */
++	*p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */
++	resp->p = p;
++
++	xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base;
++	xb->pages = &rqstp->rq_respages[rqstp->rq_resused];
++	xb->page_base = 0;
++	xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */
++	xb->tail[0].iov_base = resp->p;
++	resp->end = xb->head[0].iov_base + PAGE_SIZE;
++	xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p;
++	return 0;
++}
++/*
++ * Return a stripeindices of length 1024 to test
++ * the pNFS client multipage getdeviceinfo implementation.
++ *
++ * Encode a page of stripe indices.
++ */
++static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev,
++				   struct spnfs_device *dev,
++				   struct pnfs_devinfo_arg *info)
++{
++	struct svc_rqst *rqstp = info->xdr.resp->rqstp;
++	__be32 *p;
++	int i, j = 0;
++
++	p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]);
++	fldev->fl_stripeindices_length = 1024;
++	/* round-robin the data server device index into the stripe indices */
++	for (i = 0; i < 1024; i++) {
++		*p++ = cpu_to_be32(j);
++		if (j < dev->dscount - 1)
++			j++;
++		else
++			j = 0;
++	}
++	fldev->fl_stripeindices_list = NULL;
++}
++#endif /* CONFIG_SPNFS_TEST */
++
++/*
++ * Ask the daemon for the data servers behind @devid and encode them into
++ * @xdr as a file-layout device (one stripe index and one single-path
++ * address entry per data server).
++ *
++ * Everything allocated here is zero-initialised, so the cleanup path can
++ * run safely from any point of a partially built device description.
++ *
++ * Returns the result of filelayout_encode_devinfo() on success, the
++ * daemon's non-zero status, -ENOMEM on allocation failure or -EIO if the
++ * upcall itself fails.
++ */
++int
++spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
++		    u32 layout_type,
++		    const struct nfsd4_pnfs_deviceid *devid)
++{
++	struct spnfs *spnfs = global_spnfs;
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	struct spnfs_device *dev;
++	struct pnfs_filelayout_device *fldev = NULL;
++	struct pnfs_filelayout_multipath *mp = NULL;
++	struct pnfs_filelayout_devaddr *fldap = NULL;
++	int status = 0, i, len;
++
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++
++	im->im_type = SPNFS_TYPE_GETDEVICEINFO;
++	/* XXX FIX: figure out what to do about fsid */
++	im->im_args.getdeviceinfo_args.devid = devid->devid;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto getdeviceinfo_out;
++	}
++	status = res->getdeviceinfo_res.status;
++	if (status != 0)
++		goto getdeviceinfo_out;
++
++	dev = &res->getdeviceinfo_res.devinfo;
++
++	/* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */
++	fldev = kzalloc(sizeof(struct pnfs_filelayout_device), GFP_KERNEL);
++	if (fldev == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++
++	/*
++	 * Stripe count is the same as data server count for our purposes
++	 *
++	 * NOTE(review): dscount is not checked against the capacity of
++	 * dev->dslist, whose declaration is not visible here -- confirm.
++	 */
++	fldev->fl_stripeindices_length = dev->dscount;
++	fldev->fl_device_length = dev->dscount;
++
++	/* Set stripe indices */
++#ifdef CONFIG_SPNFS_TEST
++	/* NOTE(review): no `info' is declared in this function -- confirm */
++	spnfs_set_test_indices(fldev, dev, info);
++	fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr;
++#else /* CONFIG_SPNFS_TEST */
++	/* kcalloc() guards the count * size multiplication */
++	fldev->fl_stripeindices_list =
++		kcalloc(fldev->fl_stripeindices_length, sizeof(u32),
++			GFP_KERNEL);
++	if (fldev->fl_stripeindices_list == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++	for (i = 0; i < fldev->fl_stripeindices_length; i++)
++		fldev->fl_stripeindices_list[i] = i;
++#endif /* CONFIG_SPNFS_TEST */
++
++	/*
++	 * Set the device's data server addresses.  No multipath for spnfs,
++	 * so mp length is always 1.
++	 *
++	 * The array is zeroed so that entries not yet filled in read as
++	 * NULL in the cleanup loop below.
++	 */
++	fldev->fl_device_list =
++		kcalloc(fldev->fl_device_length,
++			sizeof(struct pnfs_filelayout_multipath),
++			GFP_KERNEL);
++	if (fldev->fl_device_list == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++	for (i = 0; i < fldev->fl_device_length; i++) {
++		mp = &fldev->fl_device_list[i];
++		mp->fl_multipath_length = 1;
++		/* zeroed: r_netid.data / r_addr.data start out NULL */
++		mp->fl_multipath_list =
++			kzalloc(sizeof(struct pnfs_filelayout_devaddr),
++				GFP_KERNEL);
++		if (mp->fl_multipath_list == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		fldap = mp->fl_multipath_list;
++
++		/*
++		 * Copy the netid into the device address, for example: "tcp"
++		 */
++		len = strlen(dev->dslist[i].netid);
++		fldap->r_netid.data = kmalloc(len, GFP_KERNEL);
++		if (fldap->r_netid.data == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		memcpy(fldap->r_netid.data, dev->dslist[i].netid, len);
++		fldap->r_netid.len = len;
++
++		/*
++		 * Copy the network address into the device address,
++		 * for example: "10.35.9.16.08.01"
++		 */
++		len = strlen(dev->dslist[i].addr);
++		fldap->r_addr.data = kmalloc(len, GFP_KERNEL);
++		if (fldap->r_addr.data == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		memcpy(fldap->r_addr.data, dev->dslist[i].addr, len);
++		fldap->r_addr.len = len;
++	}
++
++	/* encode the device data */
++	status = filelayout_encode_devinfo(xdr, fldev);
++
++getdeviceinfo_out:
++	if (fldev) {
++		kfree(fldev->fl_stripeindices_list);
++		if (fldev->fl_device_list) {
++			for (i = 0; i < fldev->fl_device_length; i++) {
++				fldap =
++				    fldev->fl_device_list[i].fl_multipath_list;
++				/* entry was never reached before the failure */
++				if (fldap == NULL)
++					continue;
++				kfree(fldap->r_netid.data);
++				kfree(fldap->r_addr.data);
++				kfree(fldap);
++			}
++			kfree(fldev->fl_device_list);
++		}
++		kfree(fldev);
++	}
++
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++/* Placeholder: nothing to do for setattr; always returns 0. */
++int
++spnfs_setattr(void)
++{
++	return 0;
++}
++
++/*
++ * Tell the daemon that @inode is being opened, passing the create,
++ * createmode and truncate fields of @open.
++ *
++ * Returns the daemon's status, -ENOMEM on allocation failure or -EIO if
++ * the upcall itself fails.
++ */
++int
++spnfs_open(struct inode *inode, struct nfsd4_open *open)
++{
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
++
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto open_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto open_out;
++	}
++
++	im->im_type = SPNFS_TYPE_OPEN;
++	im->im_args.open_args.inode = inode->i_ino;
++	im->im_args.open_args.generation = inode->i_generation;
++	im->im_args.open_args.create = open->op_create;
++	im->im_args.open_args.createmode = open->op_createmode;
++	im->im_args.open_args.truncate = open->op_truncate;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto open_out;
++	}
++	status = res->open_res.status;
++
++open_out:
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++/* Placeholder: nothing to do for create; always returns 0. */
++int
++spnfs_create(void)
++{
++	return 0;
++}
++
++/*
++ * Invokes the spnfsd with the inode number of the object to
remove.
++ * The file has already been removed on the MDS, so all the spnfsd
++ * daemon does is remove the stripes.
++ * Returns 0 on success otherwise error code
++ */
++int
++spnfs_remove(unsigned long ino, unsigned long generation)
++{
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
++
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto remove_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto remove_out;
++	}
++
++	im->im_type = SPNFS_TYPE_REMOVE;
++	im->im_args.remove_args.inode = ino;
++	im->im_args.remove_args.generation = generation;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto remove_out;
++	}
++	status = res->remove_res.status;
++
++remove_out:
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++/*
++ * Close every stripe file recorded in @filp, which has
++ * SPNFS_MAX_DATA_SERVERS slots; NULL slots are skipped.  Drops both the
++ * reference from filp_open() and the extra one taken by
++ * spnfs_open_stripes().
++ */
++static void
++spnfs_close_stripes(struct file **filp)
++{
++	int i;
++
++	for (i = 0; i < SPNFS_MAX_DATA_SERVERS; i++) {
++		if (filp[i]) {
++			filp_close(filp[i], current->files);
++			fput(filp[i]);
++		}
++	}
++}
++
++/*
++ * Open the stripe file "<ds_dir>/<ino>.<generation>" of @inode on each
++ * configured data server with @flags | O_LARGEFILE.
++ *
++ * @filp must have SPNFS_MAX_DATA_SERVERS slots; it is cleared first, so
++ * the caller can always pass it to spnfs_close_stripes(), also after a
++ * failure part way through.
++ *
++ * Returns 0 on success or a negative errno.
++ */
++static int
++spnfs_open_stripes(struct inode *inode, int flags, struct file **filp)
++{
++	char path[128];
++	int i, n, err;
++
++	memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
++
++	/* filp[] has room for SPNFS_MAX_DATA_SERVERS entries only */
++	if (spnfs_config->num_ds > SPNFS_MAX_DATA_SERVERS)
++		return -EINVAL;
++
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		n = snprintf(path, sizeof(path), "%s/%ld.%u",
++			     spnfs_config->ds_dir[i],
++			     inode->i_ino, inode->i_generation);
++		/* a truncated name would open the wrong file */
++		if (n < 0 || (size_t)n >= sizeof(path))
++			return -ENAMETOOLONG;
++
++		/* filp_open() reports failure as ERR_PTR(), never NULL */
++		filp[i] = filp_open(path, flags | O_LARGEFILE, 0);
++		if (IS_ERR(filp[i])) {
++			err = PTR_ERR(filp[i]);
++			filp[i] = NULL;
++			return err;
++		}
++		get_file(filp[i]);
++	}
++
++	return 0;
++}
++
++/*
++ * Read up to @len bytes of the striped file starting at file offset
++ * @offset into @buf, one stripe unit (or part of one) per vfs_read().
++ *
++ * Returns the number of bytes read, which is short if a data server
++ * file returns end of file, or -EIO if any vfs_read() fails.
++ */
++static int
++read_one(struct inode *inode, loff_t offset, size_t len, char *buf,
++	 struct file **filp)
++{
++	loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
++	size_t iolen;
++	int completed = 0, ds, err;
++
++	while (len > 0) {
++		/* soff: offset inside the stripe unit; snum: unit number */
++		tmp = offset;
++		soff = do_div(tmp, spnfs_config->stripe_size);
++		snum = tmp;
++		/* ds: index of the data server holding this unit */
++		ds = do_div(tmp, spnfs_config->num_ds);
++		if (spnfs_config->dense_striping == 0)
++			soffset = offset;
++		else {
++			tmp = snum;
++			do_div(tmp, spnfs_config->num_ds);
++			soffset = tmp * spnfs_config->stripe_size + soff;
++		}
++		/* never cross a stripe unit boundary in one call */
++		if (len < spnfs_config->stripe_size - soff)
++			iolen = len;
++		else
++			iolen = spnfs_config->stripe_size - soff;
++
++		pos = soffset;
++		err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos);
++		if (err < 0)
++			return -EIO;
++		if (err == 0)
++			break;
++		filp[ds]->f_pos = pos;
++		iolen = err;
++		completed += iolen;
++		len -= iolen;
++		offset += iolen;
++		bufoffset += iolen;
++	}
++
++	return completed;
++}
++
++/*
++ * Fill the first @vlen entries of rqstp->rq_vec from the stripe files of
++ * @inode, starting at file offset @offset.
++ *
++ * *@lenp always receives the number of bytes read, also on error; the
++ * read stops early at the first short stripe read.
++ *
++ * Returns nfs_ok, or nfserr_io if a stripe file cannot be opened or read.
++ */
++static __be32
++read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
++     struct svc_rqst *rqstp)
++{
++	int vnum, err, bytecount = 0;
++	struct file *filp[SPNFS_MAX_DATA_SERVERS];
++	size_t iolen;
++	__be32 status = nfs_ok;
++
++	/*
++	 * XXX We should just be doing this at open time, but it gets
++	 * kind of messy storing this info in nfsd's state structures
++	 * and piggybacking its path through the various state handling
++	 * functions.  Revisit this.
++	 */
++	if (spnfs_open_stripes(inode, O_RDONLY, filp) != 0) {
++		status = nfserr_io;
++		goto read_out;
++	}
++
++	for (vnum = 0 ; vnum < vlen ; vnum++) {
++		iolen = rqstp->rq_vec[vnum].iov_len;
++		err = read_one(inode, offset + bytecount, iolen,
++			       (char *)rqstp->rq_vec[vnum].iov_base, filp);
++		if (err < 0) {
++			status = nfserr_io;
++			goto read_out;
++		}
++		if (err < iolen) {
++			bytecount += err;
++			goto read_out;
++		}
++		bytecount += rqstp->rq_vec[vnum].iov_len;
++	}
++
++read_out:
++	*lenp = bytecount;
++	spnfs_close_stripes(filp);
++	return status;
++}
++
++/*
++ * Read entry point: requires the configuration in spnfs_config; without
++ * it the request is refused with nfserr_notsupp.
++ */
++__be32
++spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
++	   struct svc_rqst *rqstp)
++{
++	if (spnfs_config)
++		return read(inode, offset, lenp, vlen, rqstp);
++	else {
++		printk(KERN_ERR "Please upgrade to latest spnfsd\n");
++		return nfserr_notsupp;
++	}
++}
++
++/*
++ * Write @len bytes from @buf to the striped file starting at file offset
++ * @offset, one stripe unit (or part of one) per vfs_write().
++ *
++ * Returns the number of bytes written, which is short if a data server
++ * file accepts no more data, or -EIO if any vfs_write() fails.
++ */
++static int
++write_one(struct inode *inode, loff_t offset, size_t len, char *buf,
++	  struct file **filp)
++{
++	loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
++	size_t iolen;
++	int completed = 0, ds, err;
++
++	while (len > 0) {
++		/* soff: offset inside the stripe unit; snum: unit number */
++		tmp = offset;
++		soff = do_div(tmp, spnfs_config->stripe_size);
++		snum = tmp;
++		/* ds: index of the data server holding this unit */
++		ds = do_div(tmp, spnfs_config->num_ds);
++		if (spnfs_config->dense_striping == 0)
++			soffset = offset;
++		else {
++			tmp = snum;
++			do_div(tmp, spnfs_config->num_ds);
++			soffset = tmp * spnfs_config->stripe_size + soff;
++		}
++		/* never cross a stripe unit boundary in one call */
++		if (len < spnfs_config->stripe_size - soff)
++			iolen = len;
++		else
++			iolen = spnfs_config->stripe_size - soff;
++
++		pos = soffset;
++		err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos);
++		if (err < 0)
++			return -EIO;
++		/* no progress: stop rather than spin on the same range */
++		if (err == 0)
++			break;
++		filp[ds]->f_pos = pos;
++		iolen = err;
++		completed += iolen;
++		len -= iolen;
++		offset += iolen;
++		bufoffset += iolen;
++	}
++
++	return completed;
++}
++
++/*
++ * Write the first @vlen entries of rqstp->rq_vec to the stripe files of
++ * @inode, starting at file offset @offset.
++ *
++ * Returns nfs_ok, or nfserr_io if a stripe file cannot be opened or any
++ * vector entry is not written completely.
++ */
++static __be32
++write(struct inode *inode, loff_t offset, size_t len, int vlen,
++      struct svc_rqst *rqstp)
++{
++	int vnum, err, bytecount = 0;
++	struct file *filp[SPNFS_MAX_DATA_SERVERS];
++	size_t iolen;
++	__be32 status = nfs_ok;
++
++	/*
++	 * XXX We should just be doing this at open time, but it gets
++	 * kind of messy storing this info in nfsd's state structures
++	 * and piggybacking its path through the various state handling
++	 * functions.  Revisit this.
++	 */
++	if (spnfs_open_stripes(inode, O_RDWR, filp) != 0) {
++		status = nfserr_io;
++		goto write_out;
++	}
++
++	for (vnum = 0; vnum < vlen; vnum++) {
++		iolen = rqstp->rq_vec[vnum].iov_len;
++		err = write_one(inode, offset + bytecount, iolen,
++				(char *)rqstp->rq_vec[vnum].iov_base, filp);
++		if (err != iolen) {
++			/* report the size of this vector entry, not @len */
++			dprintk("spnfs_write: err=%d expected %Zd\n", err, iolen);
++			status = nfserr_io;
++			goto write_out;
++		}
++		bytecount += rqstp->rq_vec[vnum].iov_len;
++	}
++
++write_out:
++	spnfs_close_stripes(filp);
++
++	return status;
++}
++
++/*
++ * Write entry point: requires the configuration in spnfs_config; without
++ * it the request is refused with nfserr_notsupp.
++ */
++__be32
++spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen,
++	    struct svc_rqst *rqstp)
++{
++	if (spnfs_config)
++		return write(inode, offset, len, vlen, rqstp);
++	else {
++		printk(KERN_ERR "Please upgrade to latest spnfsd\n");
++		return nfserr_notsupp;
++	}
++}
++
++/* Placeholder: nothing to do for commit; always returns 0. */
++int
++spnfs_commit(void)
++{
++	return 0;
++}
++
++/*
++ * Return the state for this object.
++ * At this time simply return 0 to indicate success and use the existing state ++ */ ++int ++spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg) ++{ ++ return 0; ++} ++ ++/* ++ * Return the filehandle for the specified file descriptor ++ */ ++int ++spnfs_getfh(int fd, struct nfs_fh *fh) ++{ ++ struct file *file; ++ ++ file = fget(fd); ++ if (file == NULL) ++ return -EIO; ++ ++ memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh)); ++ fput(file); ++ return 0; ++} +diff -up linux-2.6.38.noarch/fs/nfsd/state.h.orig linux-2.6.38.noarch/fs/nfsd/state.h +--- linux-2.6.38.noarch/fs/nfsd/state.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/state.h 2011-03-26 07:57:44.303821066 -0400 +@@ -37,6 +37,7 @@ + + #include + #include ++#include + #include "nfsfh.h" + + typedef struct { +@@ -65,17 +66,6 @@ typedef struct { + (s)->si_fileid, \ + (s)->si_generation + +-struct nfsd4_callback { +- void *cb_op; +- struct nfs4_client *cb_clp; +- struct list_head cb_per_client; +- u32 cb_minorversion; +- struct rpc_message cb_msg; +- const struct rpc_call_ops *cb_ops; +- struct work_struct cb_work; +- bool cb_done; +-}; +- + struct nfs4_delegation { + struct list_head dl_perfile; + struct list_head dl_perclnt; +@@ -245,6 +235,7 @@ struct nfs4_client { + #define NFSD4_CB_UP 0 + #define NFSD4_CB_UNKNOWN 1 + #define NFSD4_CB_DOWN 2 ++#define NFSD4_CB_FAULT 3 + int cl_cb_state; + struct nfsd4_callback cl_cb_null; + struct nfsd4_session *cl_cb_session; +@@ -265,6 +256,12 @@ struct nfs4_client { + unsigned long cl_cb_slot_busy; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ ++#if defined(CONFIG_PNFSD) ++ struct list_head cl_layouts; /* outstanding layouts */ ++ struct list_head cl_layoutrecalls; /* outstanding layoutrecall ++ callbacks */ ++ atomic_t cl_deviceref; /* Num outstanding devs */ ++#endif /* CONFIG_PNFSD */ + }; + + static inline void +@@ -384,6 +381,14 @@ 
struct nfs4_file { + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; ++#if defined(CONFIG_PNFSD) ++ struct list_head fi_layouts; ++ struct list_head fi_layout_states; ++ /* used by layoutget / layoutrecall */ ++ struct nfs4_fsid fi_fsid; ++ u32 fi_fhlen; ++ u8 fi_fhval[NFS4_FHSIZE]; ++#endif /* CONFIG_PNFSD */ + }; + + /* XXX: for first cut may fall back on returning file that doesn't work +@@ -412,6 +417,15 @@ static inline struct file *find_any_file + return f->fi_fds[O_RDONLY]; + } + ++#if defined(CONFIG_PNFSD) ++/* pNFS Metadata server state */ ++ ++struct pnfs_ds_dev_entry { ++ struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */ ++ u32 dd_dsid; ++}; ++#endif /* CONFIG_PNFSD */ ++ + /* + * nfs4_stateid can either be an open stateid or (eventually) a lock stateid + * +@@ -434,6 +448,9 @@ struct nfs4_stateid { + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; ++#if defined(CONFIG_PNFSD) ++ struct list_head st_pnfs_ds_id; ++#endif /* CONFIG_PNFSD */ + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; +@@ -486,6 +503,34 @@ extern void nfsd4_recdir_purge_old(void) + extern int nfsd4_create_clid_dir(struct nfs4_client *clp); + extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); + extern void release_session_client(struct nfsd4_session *); ++extern void nfsd4_free_slab(struct kmem_cache **); ++extern struct nfs4_file *find_file(struct inode *); ++extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *); ++extern void put_nfs4_file(struct nfs4_file *); ++extern void get_nfs4_file(struct nfs4_file *); ++extern struct nfs4_client *find_confirmed_client(clientid_t *); ++extern struct nfs4_stateid *find_stateid(stateid_t *, int flags); ++extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *); ++extern __be32 nfs4_check_stateid(stateid_t *); ++extern void 
expire_client_lock(struct nfs4_client *); ++extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *); ++ ++#if defined(CONFIG_PNFSD) ++extern int nfsd4_init_pnfs_slabs(void); ++extern void nfsd4_free_pnfs_slabs(void); ++extern void pnfs_expire_client(struct nfs4_client *); ++extern void release_pnfs_ds_dev_list(struct nfs4_stateid *); ++extern void nfs4_pnfs_state_init(void); ++extern void nfs4_pnfs_state_shutdown(void); ++extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *); ++#else /* CONFIG_PNFSD */ ++static inline void nfsd4_free_pnfs_slabs(void) {} ++static inline int nfsd4_init_pnfs_slabs(void) { return 0; } ++static inline void pnfs_expire_client(struct nfs4_client *clp) {} ++static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {} ++static inline void nfs4_pnfs_state_shutdown(void) {} ++#endif /* CONFIG_PNFSD */ + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +@@ -499,4 +544,24 @@ nfs4_get_stateowner(struct nfs4_stateown + kref_get(&so->so_ref); + } + ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? 
end - 1 : NFS4_MAX_UINT64; ++} ++ + #endif /* NFSD4_STATE_H */ +diff -up linux-2.6.38.noarch/fs/nfsd/vfs.c.orig linux-2.6.38.noarch/fs/nfsd/vfs.c +--- linux-2.6.38.noarch/fs/nfsd/vfs.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/vfs.c 2011-03-26 07:57:44.304821057 -0400 +@@ -36,7 +36,12 @@ + #ifdef CONFIG_NFSD_V4 + #include "acl.h" + #include "idmap.h" ++#include "pnfsd.h" ++#include + #endif /* CONFIG_NFSD_V4 */ ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif + + #include "nfsd.h" + #include "vfs.h" +@@ -380,6 +385,16 @@ nfsd_setattr(struct svc_rqst *rqstp, str + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ if (is_inode_pnfsd_lexp(inode)) ++ pnfsd_lexp_recall_layout(inode); ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, 0)) { ++ err = bl_layoutrecall(inode, RETURN_FILE, ++ iap->ia_size, inode->i_size - iap->ia_size); ++ } ++#endif /* CONFIG_SPNFS_BLOCK */ + } + + host_err = get_write_access(inode); +@@ -1685,6 +1700,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru + struct inode *fdir, *tdir; + __be32 err; + int host_err; ++#ifdef CONFIG_SPNFS ++ unsigned long ino = 0; ++ unsigned long generation = 0; ++ unsigned int nlink = 0; ++#endif /* CONFIG_SPNFS */ + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) +@@ -1751,7 +1771,27 @@ nfsd_rename(struct svc_rqst *rqstp, stru + } + if (host_err) + goto out_drop_write; ++ ++#ifdef CONFIG_SPNFS ++ /* ++ * if the target is a preexisting regular file, remember the ++ * inode number and generation so we can delete the stripes; ++ * save the link count as well so that the stripes only get ++ * get deleted when the last link is deleted ++ */ ++ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) { ++ ino = ndentry->d_inode->i_ino; ++ generation = ndentry->d_inode->i_generation; ++ nlink = ndentry->d_inode->i_nlink; ++ } ++#endif /* 
CONFIG_SPNFS */ ++ + host_err = vfs_rename(fdir, odentry, tdir, ndentry); ++#ifdef CONFIG_SPNFS ++ if (spnfs_enabled() && (!host_err && ino && nlink == 1)) ++ spnfs_remove(ino, generation); ++#endif /* CONFIG_SPNFS */ ++ + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) +@@ -1791,6 +1831,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + struct inode *dirp; + __be32 err; + int host_err; ++#if defined(CONFIG_SPNFS) ++ unsigned long ino; ++ unsigned long generation; ++ unsigned int nlink; ++#endif /* defined(CONFIG_SPNFS) */ + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) +@@ -1814,6 +1859,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + goto out; + } + ++#if defined(CONFIG_SPNFS) ++ /* ++ * Remember the inode number to communicate to the spnfsd ++ * for removal of stripes; save the link count as well so that ++ * the stripes only get get deleted when the last link is deleted ++ */ ++ ino = rdentry->d_inode->i_ino; ++ generation = rdentry->d_inode->i_generation; ++ nlink = rdentry->d_inode->i_nlink; ++#endif /* defined(CONFIG_SPNFS) */ ++ + if (!type) + type = rdentry->d_inode->i_mode & S_IFMT; + +@@ -1830,6 +1886,26 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + host_err = vfs_rmdir(dirp, rdentry); + if (!host_err) + host_err = commit_metadata(fhp); ++ ++#if defined(CONFIG_SPNFS) ++ /* ++ * spnfs: notify spnfsd of removal to destroy stripes ++ */ ++ dprintk("%s check if spnfs_enabled\n", __FUNCTION__); ++ if (spnfs_enabled() && nlink == 1) { ++ BUG_ON(ino == 0); ++ dprintk("%s calling spnfs_remove inumber=%ld\n", ++ __FUNCTION__, ino); ++ if (spnfs_remove(ino, generation) == 0) { ++ dprintk("%s spnfs_remove success\n", __FUNCTION__); ++ } else { ++ /* XXX How do we make this atomic? 
*/ ++ printk(KERN_WARNING "nfsd: pNFS could not " ++ "remove stripes for inode: %ld\n", ino); ++ } ++ } ++#endif /* defined(CONFIG_SPNFS) */ ++ + out_drop_write: + mnt_drop_write(fhp->fh_export->ex_path.mnt); + out_put: +diff -up linux-2.6.38.noarch/fs/nfsd/xdr4.h.orig linux-2.6.38.noarch/fs/nfsd/xdr4.h +--- linux-2.6.38.noarch/fs/nfsd/xdr4.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfsd/xdr4.h 2011-03-26 07:57:44.305821048 -0400 +@@ -37,6 +37,8 @@ + #ifndef _LINUX_NFSD_XDR4_H + #define _LINUX_NFSD_XDR4_H + ++#include ++ + #include "state.h" + #include "nfsd.h" + +@@ -390,6 +392,51 @@ struct nfsd4_reclaim_complete { + u32 rca_one_fs; + }; + ++struct nfsd4_pnfs_getdevinfo { ++ struct nfsd4_pnfs_deviceid gd_devid; /* request */ ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxcount; /* request */ ++ u32 gd_notify_types;/* request */ ++ struct super_block *gd_sb; ++}; ++ ++struct nfsd4_pnfs_getdevlist { ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxdevices; /* request */ ++ u64 gd_cookie; /* request - response */ ++ u64 gd_verf; /* request - response */ ++ struct svc_fh *gd_fhp; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutget { ++ u64 lg_minlength; /* request */ ++ u32 lg_signal; /* request */ ++ u32 lg_maxcount; /* request */ ++ struct svc_fh *lg_fhp; /* request */ ++ stateid_t lg_sid; /* request/response */ ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_roc; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit { ++ struct nfsd4_pnfs_layoutcommit_arg args; ++ stateid_t lc_sid; /* request */ ++ struct nfsd4_pnfs_layoutcommit_res res; ++}; ++ ++enum layoutreturn_flags { ++ LR_FLAG_INTERN = 1 << 0, /* internal return */ ++ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */ ++}; ++ ++struct nfsd4_pnfs_layoutreturn { ++ struct nfsd4_pnfs_layoutreturn_arg args; ++ u32 lr_flags; ++ stateid_t lr_sid; /* request/resopnse */ ++ u32 lrs_present; /* response */ ++}; ++ + struct 
nfsd4_op { + int opnum; + __be32 status; +@@ -432,6 +479,13 @@ struct nfsd4_op { + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; ++#if defined(CONFIG_PNFSD) ++ struct nfsd4_pnfs_getdevlist pnfs_getdevlist; ++ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo; ++ struct nfsd4_pnfs_layoutget pnfs_layoutget; ++ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit; ++ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn; ++#endif /* CONFIG_PNFSD */ + } u; + struct nfs4_replay * replay; + }; +diff -up linux-2.6.38.noarch/fs/nfs/file.c.orig linux-2.6.38.noarch/fs/nfs/file.c +--- linux-2.6.38.noarch/fs/nfs/file.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/file.c 2011-03-26 07:57:44.247821541 -0400 +@@ -381,16 +381,16 @@ static int nfs_write_begin(struct file * + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + +- pnfs_update_layout(mapping->host, +- nfs_file_open_context(file), +- IOMODE_RW); +- ++ lseg = pnfs_update_layout(mapping->host, ++ nfs_file_open_context(file), ++ pos, len, IOMODE_RW); + start: + /* + * Prevent starvation issues if someone is doing a consistency +@@ -399,17 +399,22 @@ start: + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) +- return ret; ++ goto out; + + page = grab_cache_page_write_begin(mapping, index, flags); +- if (!page) +- return -ENOMEM; ++ if (!page) { ++ ret = -ENOMEM; ++ goto out; ++ } + *pagep = page; + +- ret = nfs_flush_incompatible(file, page); ++ ret = nfs_flush_incompatible(file, page, lseg); + if (ret) { + unlock_page(page); + page_cache_release(page); ++ *pagep = NULL; ++ *fsdata = NULL; ++ goto 
out; + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; +@@ -418,6 +423,12 @@ start: + if (!ret) + goto start; + } ++ ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata); ++ out: ++ if (ret) { ++ put_lseg(lseg); ++ *fsdata = NULL; ++ } + return ret; + } + +@@ -427,6 +438,7 @@ static int nfs_write_end(struct file *fi + { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int status; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, +@@ -453,10 +465,17 @@ static int nfs_write_end(struct file *fi + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + +- status = nfs_updatepage(file, page, offset, copied); ++ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata); ++ status = pnfs_write_end(file, page, pos, len, copied, lseg); ++ if (status) ++ goto out; ++ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata); + ++ out: + unlock_page(page); + page_cache_release(page); ++ pnfs_write_end_cleanup(file, fsdata); ++ put_lseg(lseg); + + if (status < 0) + return status; +@@ -567,6 +586,8 @@ static int nfs_vm_page_mkwrite(struct vm + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + ++ /* XXX Do we want to call pnfs_update_layout here? 
*/ ++ + lock_page(page); + mapping = page->mapping; + if (mapping != dentry->d_inode->i_mapping) +@@ -577,8 +598,8 @@ static int nfs_vm_page_mkwrite(struct vm + goto out_unlock; + + ret = VM_FAULT_LOCKED; +- if (nfs_flush_incompatible(filp, page) == 0 && +- nfs_updatepage(filp, page, 0, pagelen) == 0) ++ if (nfs_flush_incompatible(filp, page, NULL) == 0 && ++ nfs_updatepage(filp, page, 0, pagelen, NULL, NULL) == 0) + goto out; + + ret = VM_FAULT_SIGBUS; +diff -up linux-2.6.38.noarch/fs/nfs/inode.c.orig linux-2.6.38.noarch/fs/nfs/inode.c +--- linux-2.6.38.noarch/fs/nfs/inode.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/inode.c 2011-03-26 07:57:44.248821533 -0400 +@@ -653,6 +653,7 @@ struct nfs_open_context *get_nfs_open_co + atomic_inc(&ctx->lock_context.count); + return ctx; + } ++EXPORT_SYMBOL(get_nfs_open_context); + + static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) + { +@@ -1016,6 +1017,7 @@ void nfs_fattr_init(struct nfs_fattr *fa + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); + } ++EXPORT_SYMBOL(nfs_fattr_init); + + struct nfs_fattr *nfs_alloc_fattr(void) + { +@@ -1225,6 +1227,14 @@ static int nfs_update_inode(struct inode + server->fsid = fattr->fsid; + + /* ++ * file needs layout commit, server attributes may be stale ++ */ ++ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) { ++ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n", ++ __func__, inode->i_sb->s_id, inode->i_ino); ++ return 0; ++ } ++ /* + * Update the read time so we don't revalidate too often. 
+ */ + nfsi->read_cache_jiffies = fattr->time_start; +@@ -1423,9 +1433,10 @@ static int nfs_update_inode(struct inode + */ + void nfs4_evict_inode(struct inode *inode) + { +- pnfs_destroy_layout(NFS_I(inode)); ++ pnfs_return_layout(inode, NULL, true); + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); ++ pnfs_destroy_layout(NFS_I(inode)); + /* If we are holding a delegation, return it! */ + nfs_inode_return_delegation_noreclaim(inode); + /* First call standard NFS clear_inode() code */ +diff -up linux-2.6.38.noarch/fs/nfs/internal.h.orig linux-2.6.38.noarch/fs/nfs/internal.h +--- linux-2.6.38.noarch/fs/nfs/internal.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/internal.h 2011-03-26 07:57:44.249821524 -0400 +@@ -148,6 +148,16 @@ extern struct nfs_server *nfs_clone_serv + struct nfs_fattr *); + extern void nfs_mark_client_ready(struct nfs_client *clp, int state); + extern int nfs4_check_client_ready(struct nfs_client *clp); ++extern int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++ const struct sockaddr *sa2); ++extern int nfs4_set_client(struct nfs_server *server, ++ const char *hostname, ++ const struct sockaddr *addr, ++ const size_t addrlen, ++ const char *ip_addr, ++ rpc_authflavor_t authflavour, ++ int proto, const struct rpc_timeout *timeparms, ++ u32 minorversion); + #ifdef CONFIG_PROC_FS + extern int __init nfs_fs_proc_init(void); + extern void nfs_fs_proc_exit(void); +@@ -213,6 +223,8 @@ extern const u32 nfs41_maxwrite_overhead + extern struct rpc_procinfo nfs4_procedures[]; + #endif + ++extern int nfs4_recover_expired_lease(struct nfs_client *clp); ++ + /* proc.c */ + void nfs_close_context(struct nfs_open_context *ctx, int is_sync); + +@@ -262,10 +274,31 @@ extern int nfs4_get_rootfh(struct nfs_se + #endif + + /* read.c */ ++extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); ++extern int pnfs_initiate_read(struct nfs_read_data *data, struct 
rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); + extern void nfs_read_prepare(struct rpc_task *task, void *calldata); + + /* write.c */ ++extern int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs); + extern void nfs_write_prepare(struct rpc_task *task, void *calldata); ++extern void nfs_mark_list_commit(struct list_head *head); + #ifdef CONFIG_MIGRATION + extern int nfs_migrate_page(struct address_space *, + struct page *, struct page *); +diff -up linux-2.6.38.noarch/fs/nfs/Kconfig.orig linux-2.6.38.noarch/fs/nfs/Kconfig +--- linux-2.6.38.noarch/fs/nfs/Kconfig.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/Kconfig 2011-03-26 07:57:44.233821664 -0400 +@@ -87,6 +87,34 @@ config NFS_V4_1 + config PNFS_FILE_LAYOUT + tristate + ++config PNFS_OBJLAYOUT ++ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD ++ help ++ Say M here if you want your pNFS client to support the Objects Layout Driver. ++ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and ++ upper level driver (SCSI_OSD_ULD). ++ ++ If unsure, say N. ++ ++config PNFS_PANLAYOUT ++ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on PNFS_OBJLAYOUT ++ help ++ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver. ++ ++ If unsure, say N. 
++ ++config PNFS_BLOCK ++ tristate "Provide a pNFS block client (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 ++ select MD ++ select BLK_DEV_DM ++ help ++ Say M or y here if you want your pNfs client to support the block protocol ++ ++ If unsure, say N. ++ + config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP +diff -up linux-2.6.38.noarch/fs/nfs/Makefile.orig linux-2.6.38.noarch/fs/nfs/Makefile +--- linux-2.6.38.noarch/fs/nfs/Makefile.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/Makefile 2011-03-26 07:57:44.234821653 -0400 +@@ -21,3 +21,6 @@ nfs-$(CONFIG_NFS_FSCACHE) += fscache.o f + + obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o + nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o ++ ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ ++obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ +diff -up linux-2.6.38.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.38.noarch/fs/nfs/nfs4filelayout.c +--- linux-2.6.38.noarch/fs/nfs/nfs4filelayout.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/nfs4filelayout.c 2011-03-26 07:57:44.251821506 -0400 +@@ -41,7 +41,7 @@ MODULE_AUTHOR("Dean Hildebrand nfs_client, + nfs4_fl_free_deviceid_callback); +@@ -66,6 +66,200 @@ filelayout_clear_layoutdriver(struct nfs + return 0; + } + ++/* This function is used by the layout driver to calculate the ++ * offset of the file on the dserver based on whether the ++ * layout type is STRIPE_DENSE or STRIPE_SPARSE ++ */ ++static loff_t ++filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); ++ ++ switch (flseg->stripe_type) { ++ case STRIPE_SPARSE: ++ return offset; ++ ++ case STRIPE_DENSE: ++ { ++ u32 stripe_width; ++ u64 tmp, off; ++ u32 unit = flseg->stripe_unit; ++ ++ stripe_width = unit * flseg->dsaddr->stripe_count; ++ tmp = off = offset - flseg->pattern_offset; ++ do_div(tmp, stripe_width); ++ return 
tmp * unit + do_div(off, unit); ++ } ++ default: ++ BUG(); ++ } ++ ++ /* We should never get here... just to stop the gcc warning */ ++ return 0; ++} ++ ++/* ++ * Call ops for the async read/write cases ++ * In the case of dense layouts, the offset needs to be reset to its ++ * original value. ++ */ ++static void filelayout_read_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ if (rdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ rdata->args.offset, rdata->fldata.orig_offset); ++ rdata->args.offset = rdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ rdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_read_release(void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ put_lseg(rdata->pdata.lseg); ++ rdata->pdata.lseg = NULL; ++ rdata->pdata.call_ops->rpc_release(data); ++} ++ ++static void filelayout_write_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ if (wdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ wdata->args.offset, wdata->fldata.orig_offset); ++ wdata->args.offset = wdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_write_release(void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ put_lseg(wdata->pdata.lseg); ++ wdata->pdata.lseg = NULL; ++ wdata->pdata.call_ops->rpc_release(data); ++} ++ ++struct rpc_call_ops filelayout_read_call_ops = { ++ .rpc_call_prepare = nfs_read_prepare, ++ .rpc_call_done = filelayout_read_call_done, ++ .rpc_release = filelayout_read_release, ++}; ++ ++struct rpc_call_ops filelayout_write_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ .rpc_call_done = 
filelayout_write_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* Perform sync or async reads. ++ * ++ * An optimization for the NFS file layout driver ++ * allows the original read/write data structs to be passed in the ++ * last argument. ++ * ++ * TODO: join with write_pagelist? ++ */ ++static enum pnfs_try_status ++filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n", ++ __func__, data->inode->i_ino, nr_pages, ++ data->args.pgbase, (size_t)data->args.count, offset); ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s USE DS:ip %x %hu\n", __func__, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); ++ ++ /* just try the first data server for the index..*/ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ ++ /* ++ * Now get the file offset on the dserver ++ * Set the read offset to this offset, and ++ * save the original offset in orig_offset ++ * In the case of aync reads, the offset will be reset in the ++ * call_ops->rpc_call_done() routine. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* Perform an asynchronous read */ ++ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_read_call_ops); ++ ++ data->pdata.pnfs_error = 0; ++ ++ return PNFS_ATTEMPTED; ++} ++ ++/* Perform async writes. 
*/ ++static enum pnfs_try_status ++filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, ++ data->inode->i_ino, sync, (size_t) data->args.count, offset, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); ++ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ /* ++ * Get the file offset on the dserver. Set the write offset to ++ * this offset and save the original offset. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* ++ * Perform an asynchronous write The offset will be reset in the ++ * call_ops->rpc_call_done() routine ++ */ ++ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_write_call_ops, sync); ++ ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++} ++ + /* + * filelayout_check_layout() + * +@@ -87,13 +281,13 @@ filelayout_check_layout(struct pnfs_layo + dprintk("--> %s\n", __func__); + + if (fl->pattern_offset > lgr->range.offset) { +- dprintk("%s pattern_offset %lld to large\n", ++ dprintk("%s pattern_offset %lld too large\n", + __func__, fl->pattern_offset); + goto out; + } + +- if (fl->stripe_unit % PAGE_SIZE) { +- dprintk("%s Stripe unit (%u) not page aligned\n", ++ if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) { ++ dprintk("%s Invalid stripe unit (%u)\n", + __func__, fl->stripe_unit); + goto out; + } +@@ -252,14 +446,229 @@ filelayout_free_lseg(struct pnfs_layout_ + 
_filelayout_free_lseg(fl);
+ }
+ 
++/* Allocate a new nfs_write_data initialized from @old; NULL on alloc failure */
++static struct nfs_write_data *
++filelayout_clone_write_data(struct nfs_write_data *old)
++{
++	struct nfs_write_data *new;
++
++	new = nfs_commitdata_alloc();
++	if (!new)
++		goto out;
++	kref_init(&new->refcount);
++	new->parent = old;
++	kref_get(&old->refcount);
++	new->inode = old->inode;
++	new->cred = old->cred;
++	new->args.offset = 0;
++	new->args.count = 0;
++	new->res.count = 0;
++	new->res.fattr = &new->fattr;
++	nfs_fattr_init(&new->fattr);
++	new->res.verf = &new->verf;
++	new->args.context = get_nfs_open_context(old->args.context);
++	new->pdata.lseg = NULL;
++	new->pdata.call_ops = old->pdata.call_ops;
++	new->pdata.how = old->pdata.how;
++out:
++	return new;
++}
++
++static void filelayout_commit_call_done(struct rpc_task *task, void *data)
++{
++	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
++
++	wdata->pdata.call_ops->rpc_call_done(task, data);
++}
++
++static struct rpc_call_ops filelayout_commit_call_ops = {
++	.rpc_call_prepare = nfs_write_prepare,
++	.rpc_call_done = filelayout_commit_call_done,
++	.rpc_release = filelayout_write_release,
++};
++
++/*
++ * Execute a COMMIT op to the MDS or to each data server on which a page
++ * in 'pages' exists.
++ * Invoke the pnfs_commit_complete callback.
++ */ ++enum pnfs_try_status ++filelayout_commit(struct nfs_write_data *data, int sync) ++{ ++ LIST_HEAD(head); ++ struct nfs_page *req; ++ loff_t file_offset = 0; ++ u16 idx, i; ++ struct list_head **ds_page_list = NULL; ++ u16 *indices_used; ++ int num_indices_seen = 0; ++ bool used_mds = false; ++ const struct rpc_call_ops *call_ops; ++ struct rpc_clnt *clnt; ++ struct nfs_write_data **clone_list = NULL; ++ struct nfs_write_data *dsdata; ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("%s data %p sync %d\n", __func__, data, sync); ++ ++ /* Alloc room for both in one go */ ++ ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) * ++ (sizeof(u16) + sizeof(struct list_head *)), ++ GFP_KERNEL); ++ if (!ds_page_list) ++ goto mem_error; ++ indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1); ++ /* ++ * Sort pages based on which ds to send to. ++ * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT. ++ * Note we are assuming there is only a single lseg in play. ++ * When that is not true, we could first sort on lseg, then ++ * sort within each as we do here. 
++ */ ++ while (!list_empty(&data->pages)) { ++ req = nfs_list_entry(data->pages.next); ++ nfs_list_remove_request(req); ++ if (!req->wb_lseg || ++ ((struct nfs4_filelayout_segment *) ++ FILELAYOUT_LSEG(req->wb_lseg))->commit_through_mds) ++ idx = NFS4_PNFS_MAX_MULTI_CNT; ++ else { ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset); ++ } ++ if (ds_page_list[idx]) { ++ /* Already seen this idx */ ++ list_add(&req->wb_list, ds_page_list[idx]); ++ } else { ++ /* New idx not seen so far */ ++ list_add_tail(&req->wb_list, &head); ++ indices_used[num_indices_seen++] = idx; ++ } ++ ds_page_list[idx] = &req->wb_list; ++ } ++ /* Once created, clone must be released via call_op */ ++ clone_list = kzalloc(num_indices_seen * ++ sizeof(struct nfs_write_data *), GFP_KERNEL); ++ if (!clone_list) ++ goto mem_error; ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ if (indices_used[i] == NFS4_PNFS_MAX_MULTI_CNT) { ++ used_mds = true; ++ clone_list[i] = data; ++ } else { ++ clone_list[i] = filelayout_clone_write_data(data); ++ if (!clone_list[i]) ++ goto mem_error; ++ } ++ } ++ if (used_mds) { ++ clone_list[i] = filelayout_clone_write_data(data); ++ if (!clone_list[i]) ++ goto mem_error; ++ } else ++ clone_list[i] = data; ++ /* ++ * Now send off the RPCs to each ds. Note that it is important ++ * that any RPC to the MDS be sent last (or at least after all ++ * clones have been made.) 
++ */ ++ for (i = 0; i < num_indices_seen; i++) { ++ dsdata = clone_list[i]; ++ idx = indices_used[i]; ++ list_cut_position(&dsdata->pages, &head, ds_page_list[idx]); ++ if (idx == NFS4_PNFS_MAX_MULTI_CNT) { ++ call_ops = data->pdata.call_ops;; ++ clnt = NFS_CLIENT(dsdata->inode); ++ ds = NULL; ++ } else { ++ struct nfs_fh *fh; ++ ++ call_ops = &filelayout_commit_call_ops; ++ req = nfs_list_entry(dsdata->pages.next); ++ ds = nfs4_fl_prepare_ds(req->wb_lseg, idx); ++ if (!ds) { ++ /* Trigger retry of this chunk through MDS */ ++ dsdata->task.tk_status = -EIO; ++ data->pdata.call_ops->rpc_release(dsdata); ++ continue; ++ } ++ clnt = ds->ds_clp->cl_rpcclient; ++ dsdata->fldata.ds_nfs_client = ds->ds_clp; ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset); ++ if (fh) ++ dsdata->args.fh = fh; ++ } ++ dprintk("%s: Initiating commit: %llu USE DS:\n", ++ __func__, file_offset); ++ ifdebug(FACILITY) ++ print_ds(ds); ++ ++ /* Send COMMIT to data server */ ++ nfs_initiate_commit(dsdata, clnt, call_ops, sync); ++ } ++ kfree(clone_list); ++ kfree(ds_page_list); ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++ ++ mem_error: ++ if (clone_list) { ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ if (!clone_list[i]) ++ break; ++ data->pdata.call_ops->rpc_release(clone_list[i]); ++ } ++ kfree(clone_list); ++ } ++ kfree(ds_page_list); ++ /* One of these will be empty, but doesn't hurt to do both */ ++ nfs_mark_list_commit(&head); ++ nfs_mark_list_commit(&data->pages); ++ data->pdata.call_ops->rpc_release(data); ++ return PNFS_ATTEMPTED; ++} ++ ++/* ++ * filelayout_pg_test(). 
Called by nfs_can_coalesce_requests() ++ * ++ * return 1 : coalesce page ++ * return 0 : don't coalesce page ++ * ++ * By the time this is called, we know req->wb_lseg == prev->wb_lseg ++ */ ++int ++filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ u64 p_stripe, r_stripe; ++ u32 stripe_unit; ++ ++ if (!req->wb_lseg) ++ return 1; ++ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; ++ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; ++ stripe_unit = FILELAYOUT_LSEG(req->wb_lseg)->stripe_unit; ++ ++ do_div(p_stripe, stripe_unit); ++ do_div(r_stripe, stripe_unit); ++ ++ return (p_stripe == r_stripe); ++} ++ + static struct pnfs_layoutdriver_type filelayout_type = { + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .owner = THIS_MODULE, ++ .flags = PNFS_USE_RPC_CODE, + .set_layoutdriver = filelayout_set_layoutdriver, + .clear_layoutdriver = filelayout_clear_layoutdriver, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, ++ .pg_test = filelayout_pg_test, ++ .read_pagelist = filelayout_read_pagelist, ++ .write_pagelist = filelayout_write_pagelist, ++ .commit = filelayout_commit, + }; + + static int __init nfs4filelayout_init(void) +diff -up linux-2.6.38.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.38.noarch/fs/nfs/nfs4filelayoutdev.c +--- linux-2.6.38.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/nfs4filelayoutdev.c 2011-03-26 07:57:44.252821497 -0400 +@@ -104,6 +104,109 @@ _data_server_lookup_locked(u32 ip_addr, + return NULL; + } + ++/* Create an rpc to the data server defined in 'dev_list' */ ++static int ++nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) ++{ ++ struct nfs_server *tmp; ++ struct sockaddr_in sin; ++ struct rpc_clnt *mds_clnt = mds_srv->client; ++ struct nfs_client *clp = mds_srv->nfs_client; ++ struct sockaddr *mds_addr; ++ int err = 0; ++ ++ dprintk("--> %s 
ip:port %x:%hu au_flavor %d\n", __func__,
++		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
++		mds_clnt->cl_auth->au_flavor);
++
++	sin.sin_family = AF_INET;
++	sin.sin_addr.s_addr = ds->ds_ip_addr;
++	sin.sin_port = ds->ds_port;
++
++	/*
++	 * If this DS is also the MDS, use the MDS session only if the
++	 * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role.
++	 */
++	mds_addr = (struct sockaddr *)&clp->cl_addr;
++	if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) {
++		if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) {
++			printk(KERN_INFO
++				"ip:port %x:%hu is not a pNFS Data Server\n",
++				ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
++			err = -ENODEV;
++		} else {
++			atomic_inc(&clp->cl_count);
++			ds->ds_clp = clp;
++			dprintk("%s Using MDS Session for DS\n", __func__);
++		}
++		goto out;
++	}
++	err = -ENOMEM;
++	/* Temporary server for nfs4_set_client */
++	tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
++	if (!tmp)
++		goto out;
++
++	/*
++	 * Set a retrans, timeout interval, and authflavor equal to the MDS
++	 * values. Use the MDS nfs_client cl_ipaddr field so as to use the
++	 * same co_ownerid as the MDS.
++ */ ++ err = nfs4_set_client(tmp, ++ mds_srv->nfs_client->cl_hostname, ++ (struct sockaddr *)&sin, ++ sizeof(struct sockaddr), ++ mds_srv->nfs_client->cl_ipaddr, ++ mds_clnt->cl_auth->au_flavor, ++ IPPROTO_TCP, ++ mds_clnt->cl_xprt->timeout, ++ 1 /* minorversion */); ++ if (err < 0) ++ goto out_free; ++ ++ clp = tmp->nfs_client; ++ ++ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */ ++ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp); ++ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS; ++ ++ err = nfs4_recover_expired_lease(clp); ++ if (!err) ++ err = nfs4_check_client_ready(clp); ++ if (err) ++ goto out_put; ++ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %x:%hu is not a pNFS Data Server\n", ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); ++ err = -ENODEV; ++ goto out_put; ++ } ++ /* ++ * Set DS lease equal to the MDS lease, renewal is scheduled in ++ * create_session ++ */ ++ spin_lock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; ++ spin_unlock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_last_renewal = jiffies; ++ ++ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); ++ ds->ds_clp = clp; ++ ++ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), ++ clp->cl_rpcclient); ++out_free: ++ kfree(tmp); ++out: ++ dprintk("%s Returns %d\n", __func__, err); ++ return err; ++out_put: ++ nfs_put_client(clp); ++ goto out_free; ++} ++ + static void + destroy_ds(struct nfs4_pnfs_ds *ds) + { +@@ -455,3 +558,72 @@ nfs4_fl_find_get_deviceid(struct nfs_cli + return (d == NULL) ? 
NULL : + container_of(d, struct nfs4_file_layout_dsaddr, deviceid); + } ++ ++/* ++ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit ++ * Then: ((res + fsi) % dsaddr->stripe_count) ++ */ ++static u32 ++_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); ++ u64 tmp; ++ ++ tmp = offset - flseg->pattern_offset; ++ do_div(tmp, flseg->stripe_unit); ++ tmp += flseg->first_stripe_index; ++ return do_div(tmp, flseg->dsaddr->stripe_count); ++} ++ ++u32 ++nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ u32 j; ++ ++ j = _nfs4_fl_calc_j_index(lseg, offset); ++ return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j]; ++} ++ ++struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); ++ u32 i; ++ ++ if (flseg->stripe_type == STRIPE_SPARSE) { ++ if (flseg->num_fh == 1) ++ i = 0; ++ else if (flseg->num_fh == 0) ++ return NULL; ++ else ++ i = nfs4_fl_calc_ds_index(lseg, offset); ++ } else ++ i = _nfs4_fl_calc_j_index(lseg, offset); ++ return flseg->fh_array[i]; ++} ++ ++struct nfs4_pnfs_ds * ++nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) ++{ ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; ++ if (dsaddr->ds_list[ds_idx] == NULL) { ++ printk(KERN_ERR "%s: No data server for device id!\n", ++ __func__); ++ return NULL; ++ } ++ ++ if (!dsaddr->ds_list[ds_idx]->ds_clp) { ++ int err; ++ ++ err = nfs4_pnfs_ds_create(NFS_SERVER(lseg->pls_layout->plh_inode), ++ dsaddr->ds_list[ds_idx]); ++ if (err) { ++ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n", ++ __func__, err); ++ return NULL; ++ } ++ } ++ return dsaddr->ds_list[ds_idx]; ++} +diff -up linux-2.6.38.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.38.noarch/fs/nfs/nfs4filelayout.h +--- linux-2.6.38.noarch/fs/nfs/nfs4filelayout.h.orig 
2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/nfs4filelayout.h 2011-03-26 07:57:44.252821497 -0400 +@@ -83,9 +83,15 @@ FILELAYOUT_LSEG(struct pnfs_layout_segme + generic_hdr); + } + ++extern struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset); ++ + extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); + extern void print_ds(struct nfs4_pnfs_ds *ds); + extern void print_deviceid(struct nfs4_deviceid *dev_id); ++u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset); ++struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, ++ u32 ds_idx); + extern struct nfs4_file_layout_dsaddr * + nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); + struct nfs4_file_layout_dsaddr * +diff -up linux-2.6.38.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.38.noarch/fs/nfs/nfs4_fs.h +--- linux-2.6.38.noarch/fs/nfs/nfs4_fs.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/nfs4_fs.h 2011-03-26 07:57:44.250821515 -0400 +@@ -250,10 +250,12 @@ static inline struct nfs4_session *nfs4_ + } + + extern int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); + extern void nfs4_destroy_session(struct nfs4_session *session); + extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); ++extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *); + extern int nfs4_proc_create_session(struct nfs_client *); + extern int nfs4_proc_destroy_session(struct nfs4_session *); + extern int nfs4_init_session(struct nfs_server *server); +@@ -266,6 +268,7 @@ static inline struct nfs4_session *nfs4_ + } + + static inline int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int 
cache_reply, struct rpc_task *task) + { +@@ -283,7 +286,7 @@ extern const struct nfs4_minor_version_o + extern const u32 nfs4_fattr_bitmap[2]; + extern const u32 nfs4_statfs_bitmap[2]; + extern const u32 nfs4_pathconf_bitmap[2]; +-extern const u32 nfs4_fsinfo_bitmap[2]; ++extern const u32 nfs4_fsinfo_bitmap[3]; + extern const u32 nfs4_fs_locations_bitmap[2]; + + /* nfs4renewd.c */ +@@ -293,10 +296,10 @@ extern void nfs4_kill_renewd(struct nfs_ + extern void nfs4_renew_state(struct work_struct *); + + /* nfs4state.c */ ++struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); + struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); + struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); + #if defined(CONFIG_NFS_V4_1) +-struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); + struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); + extern void nfs4_schedule_session_recovery(struct nfs4_session *); + #else +@@ -305,6 +308,17 @@ static inline void nfs4_schedule_session + } + #endif /* CONFIG_NFS_V4_1 */ + ++static inline struct rpc_cred * ++nfs4_get_machine_cred(struct nfs_client *clp) ++{ ++ struct rpc_cred *cred; ++ ++ spin_lock(&clp->cl_lock); ++ cred = nfs4_get_machine_cred_locked(clp); ++ spin_unlock(&clp->cl_lock); ++ return cred; ++} ++ + extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); + extern void nfs4_put_state_owner(struct nfs4_state_owner *); + extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); +diff -up linux-2.6.38.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.38.noarch/fs/nfs/nfs4proc.c +--- linux-2.6.38.noarch/fs/nfs/nfs4proc.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/nfs4proc.c 2011-03-26 07:57:44.255821471 -0400 +@@ -70,7 +70,7 @@ struct nfs4_opendata; + static int _nfs4_proc_open(struct nfs4_opendata *data); + static int _nfs4_recover_proc_open(struct nfs4_opendata 
*data); + static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); ++static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *); + static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, +@@ -128,12 +128,13 @@ const u32 nfs4_pathconf_bitmap[2] = { + 0 + }; + +-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE ++const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE + | FATTR4_WORD0_MAXREAD + | FATTR4_WORD0_MAXWRITE + | FATTR4_WORD0_LEASE_TIME, + FATTR4_WORD1_TIME_DELTA +- | FATTR4_WORD1_FS_LAYOUT_TYPES ++ | FATTR4_WORD1_FS_LAYOUT_TYPES, ++ FATTR4_WORD2_LAYOUT_BLKSIZE + }; + + const u32 nfs4_fs_locations_bitmap[2] = { +@@ -573,6 +574,7 @@ static int nfs41_setup_sequence(struct n + } + + int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, +@@ -581,6 +583,8 @@ int nfs4_setup_sequence(const struct nfs + struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + ++ if (ds_session) ++ session = ds_session; + if (session == NULL) { + args->sa_session = NULL; + res->sr_session = NULL; +@@ -611,7 +615,7 @@ static void nfs41_call_sync_prepare(stru + + dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); + +- if (nfs4_setup_sequence(data->seq_server, data->seq_args, ++ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args, + data->seq_res, data->cache_reply, task)) + return; + rpc_call_start(task); +@@ -1398,7 +1402,7 @@ static void 
nfs4_open_prepare(struct rpc + nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); + } + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->o_arg.server, ++ if (nfs4_setup_sequence(data->o_arg.server, NULL, + &data->o_arg.seq_args, + &data->o_res.seq_res, 1, task)) + return; +@@ -1573,9 +1577,8 @@ static int _nfs4_proc_open(struct nfs4_o + return 0; + } + +-static int nfs4_recover_expired_lease(struct nfs_server *server) ++int nfs4_recover_expired_lease(struct nfs_client *clp) + { +- struct nfs_client *clp = server->nfs_client; + unsigned int loop; + int ret; + +@@ -1591,6 +1594,7 @@ static int nfs4_recover_expired_lease(st + } + return ret; + } ++EXPORT_SYMBOL(nfs4_recover_expired_lease); + + /* + * OPEN_EXPIRED: +@@ -1679,7 +1683,7 @@ static int _nfs4_do_open(struct inode *d + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); + goto out_err; + } +- status = nfs4_recover_expired_lease(server); ++ status = nfs4_recover_expired_lease(server->nfs_client); + if (status != 0) + goto err_put_state_owner; + if (path->dentry->d_inode != NULL) +@@ -1897,7 +1901,7 @@ static void nfs4_close_done(struct rpc_t + if (calldata->arg.fmode == 0) + break; + default: +- if (nfs4_async_handle_error(task, server, state) == -EAGAIN) ++ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } + nfs_release_seqid(calldata->arg.seqid); +@@ -1949,7 +1953,7 @@ static void nfs4_close_prepare(struct rp + + nfs_fattr_init(calldata->res.fattr); + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), ++ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL, + &calldata->arg.seq_args, &calldata->res.seq_res, + 1, task)) + return; +@@ -2269,6 +2273,9 @@ nfs4_proc_setattr(struct dentry *dentry, + struct nfs4_state *state = NULL; + int status; + ++ if (pnfs_ld_layoutret_on_setattr(inode)) ++ pnfs_return_layout(inode, NULL, true); ++ + nfs_fattr_init(fattr); + + /* Search for an existing open(O_WRITE) file 
*/ +@@ -2596,7 +2603,7 @@ static int nfs4_proc_unlink_done(struct + + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; +- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) ++ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); + nfs_post_op_update_inode(dir, res->dir_attr); +@@ -2621,7 +2628,7 @@ static int nfs4_proc_rename_done(struct + + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; +- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) ++ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN) + return 0; + + update_changeattr(old_dir, &res->old_cinfo); +@@ -3072,19 +3079,31 @@ static int nfs4_proc_pathconf(struct nfs + static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) + { + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; + + dprintk("--> %s\n", __func__); + ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + +- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, server->nfs_client); ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } + + nfs_invalidate_atime(data->inode); +- if (task->tk_status > 0) ++ if (task->tk_status > 0 && client == server->nfs_client) + renew_lease(server, data->timestamp); + return 0; + } +@@ -3095,20 +3114,56 @@ static void nfs4_proc_read_setup(struct + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + } + ++static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct 
nfs_write_data *data) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ + static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs_client *client = server->nfs_client; ++ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ /* restore original count after retry? */ ++ if (data->pdata.orig_count) { ++ dprintk("%s: restoring original count %u\n", __func__, ++ data->pdata.orig_count); ++ data->args.count = data->pdata.orig_count; ++ } ++ ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } ++ ++ /* ++ * MDS write: renew lease ++ * DS write: update lastbyte written, mark for layout commit ++ */ + if (task->tk_status >= 0) { +- renew_lease(NFS_SERVER(inode), data->timestamp); +- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ if (client == server->nfs_client) { ++ renew_lease(server, data->timestamp); ++ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ } else ++ pnfs4_update_write_done(NFS_I(inode), data); + } + return 0; + } +@@ -3121,21 +3176,42 @@ static void nfs4_proc_write_setup(struct + data->res.server = server; + data->timestamp = jiffies; + ++#ifdef CONFIG_NFS_V4_1 ++ /* writes to DS use pnfs vector */ ++ if 
(data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; + } + + static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- ++ struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; ++ ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS commit\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { ++ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) { + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + return -EAGAIN; + } +- nfs_refresh_inode(inode, data->res.fattr); ++ if (client == server->nfs_client) ++ nfs_refresh_inode(inode, data->res.fattr); + return 0; + } + +@@ -3145,6 +3221,12 @@ static void nfs4_proc_commit_setup(struc + + data->args.bitmask = server->cache_consistency_bitmask; + data->res.server = server; ++#if defined(CONFIG_NFS_V4_1) ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + } + +@@ -3490,9 +3572,10 @@ static int nfs4_proc_set_acl(struct inod + } + + static int +-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) ++nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp) + { +- struct nfs_client *clp = server->nfs_client; ++ if (!clp) ++ clp = server->nfs_client; + + if 
(task->tk_status >= 0) + return 0; +@@ -3524,7 +3607,8 @@ nfs4_async_handle_error(struct rpc_task + return -EAGAIN; + #endif /* CONFIG_NFS_V4_1 */ + case -NFS4ERR_DELAY: +- nfs_inc_server_stats(server, NFSIOS_DELAY); ++ if (server) ++ nfs_inc_server_stats(server, NFSIOS_DELAY); + case -NFS4ERR_GRACE: + case -EKEYEXPIRED: + rpc_delay(task, NFS4_POLL_RETRY_MAX); +@@ -3537,6 +3621,8 @@ nfs4_async_handle_error(struct rpc_task + task->tk_status = nfs4_map_errors(task->tk_status); + return 0; + wait_on_recovery: ++ if (is_ds_only_client(clp)) ++ return 0; + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) + rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); +@@ -3669,8 +3755,8 @@ static void nfs4_delegreturn_done(struct + renew_lease(data->res.server, data->timestamp); + break; + default: +- if (nfs4_async_handle_error(task, data->res.server, NULL) == +- -EAGAIN) { ++ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) ++ == -EAGAIN) { + nfs_restart_rpc(task, data->res.server->nfs_client); + return; + } +@@ -3690,7 +3776,7 @@ static void nfs4_delegreturn_prepare(str + + d_data = (struct nfs4_delegreturndata *)data; + +- if (nfs4_setup_sequence(d_data->res.server, ++ if (nfs4_setup_sequence(d_data->res.server, NULL, + &d_data->args.seq_args, + &d_data->res.seq_res, 1, task)) + return; +@@ -3924,7 +4010,7 @@ static void nfs4_locku_done(struct rpc_t + case -NFS4ERR_EXPIRED: + break; + default: +- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) ++ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN) + nfs_restart_rpc(task, + calldata->server->nfs_client); + } +@@ -3942,7 +4028,7 @@ static void nfs4_locku_prepare(struct rp + return; + } + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence(calldata->server, ++ if (nfs4_setup_sequence(calldata->server, NULL, + &calldata->arg.seq_args, + &calldata->res.seq_res, 1, task)) + return; +@@ -4097,7 +4183,7 @@ 
static void nfs4_lock_prepare(struct rpc + } else + data->arg.new_lock_owner = 0; + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->server, ++ if (nfs4_setup_sequence(data->server, NULL, + &data->arg.seq_args, + &data->res.seq_res, 1, task)) + return; +@@ -5112,7 +5198,7 @@ int nfs4_init_session(struct nfs_server + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + +- ret = nfs4_recover_expired_lease(server); ++ ret = nfs4_recover_expired_lease(server->nfs_client); + if (!ret) + ret = nfs4_check_client_ready(clp); + return ret; +@@ -5381,7 +5467,7 @@ nfs4_layoutget_prepare(struct rpc_task * + * However, that is not so catastrophic, and there seems + * to be no way to prevent it completely. + */ +- if (nfs4_setup_sequence(server, &lgp->args.seq_args, ++ if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args, + &lgp->res.seq_res, 0, task)) + return; + if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, +@@ -5390,6 +5476,7 @@ nfs4_layoutget_prepare(struct rpc_task * + rpc_exit(task, NFS4_OK); + return; + } ++ + rpc_call_start(task); + } + +@@ -5398,11 +5485,16 @@ static void nfs4_layoutget_done(struct r + struct nfs4_layoutget *lgp = calldata; + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + +- dprintk("--> %s\n", __func__); ++ dprintk("--> %s: tk_status=%d\n", __func__, task->tk_status); + +- if (!nfs4_sequence_done(task, &lgp->res.seq_res)) ++ if (!nfs4_sequence_done(task, &lgp->res.seq_res)) { ++ /* layout code relies on fact that in this case ++ * code falls back to tk_action=call_start, but not ++ * back to rpc_prepare_task, to keep plh_outstanding ++ * correct. 
++ */ + return; +- ++ } + switch (task->tk_status) { + case 0: + break; +@@ -5411,7 +5503,8 @@ static void nfs4_layoutget_done(struct r + task->tk_status = -NFS4ERR_DELAY; + /* Fall through */ + default: +- if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) { ++ dprintk("<-- %s retrying\n", __func__); + rpc_restart_call_prepare(task); + return; + } +@@ -5477,6 +5570,241 @@ int nfs4_proc_layoutget(struct nfs4_layo + return status; + } + ++static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data) ++{ ++ struct nfs4_layoutcommit_data *ldata = ++ (struct nfs4_layoutcommit_data *)data; ++ struct nfs_server *server = NFS_SERVER(ldata->args.inode); ++ ++ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args, ++ &ldata->res.seq_res, 1, task)) ++ return; ++ ldata->res.status = -1; ++ rpc_call_start(task); ++} ++ ++static void ++nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutcommit_data *data = ++ (struct nfs4_layoutcommit_data *)calldata; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++} ++ ++static void nfs4_layoutcommit_release(void *lcdata) ++{ ++ struct nfs4_layoutcommit_data *data = ++ (struct nfs4_layoutcommit_data *)lcdata; ++ ++ pnfs_cleanup_layoutcommit(data->args.inode, data); ++ /* Matched by get_layout in pnfs_layoutcommit_inode */ ++ put_layout_hdr(NFS_I(data->args.inode)->layout); ++ put_rpccred(data->cred); ++ kfree(lcdata); ++} ++ ++static const struct rpc_call_ops nfs4_layoutcommit_ops = { ++ .rpc_call_prepare = nfs4_layoutcommit_prepare, ++ .rpc_call_done = nfs4_layoutcommit_done, ++ .rpc_release = nfs4_layoutcommit_release, ++}; ++ ++/* Execute a layoutcommit to the server */ ++int ++nfs4_proc_layoutcommit(struct 
nfs4_layoutcommit_data *data, int issync) ++{ ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], ++ .rpc_argp = &data->args, ++ .rpc_resp = &data->res, ++ .rpc_cred = data->cred, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .task = &data->task, ++ .rpc_client = NFS_CLIENT(data->args.inode), ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutcommit_ops, ++ .callback_data = data, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ struct rpc_task *task; ++ int status = 0; ++ ++ dprintk("NFS: %4d initiating layoutcommit call. %llu@%llu lbw: %llu " ++ "type: %d issync %d\n", ++ data->task.tk_pid, ++ data->args.range.length, ++ data->args.range.offset, ++ data->args.lastbytewritten, ++ data->args.layout_type, issync); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = task->tk_status; ++out: ++ dprintk("%s: status %d\n", __func__, status); ++ rpc_put_task(task); ++ return status; ++} ++ ++static void ++nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, ++ &lrp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ struct nfs_server *server; ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lrp->res.seq_res)) ++ return; ++ ++ if (lrp->args.return_type == RETURN_FILE) ++ server = NFS_SERVER(lrp->args.inode); ++ else ++ server = NULL; ++ if (nfs4_async_handle_error(task, server, NULL, lrp->clp) == -EAGAIN) { ++ nfs_restart_rpc(task, lrp->clp); ++ return; ++ } ++ if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) { ++ struct 
pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; ++ ++ spin_lock(&lo->plh_inode->i_lock); ++ if (lrp->res.lrs_present) ++ pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); ++ else ++ BUG_ON(!list_empty(&lo->plh_segs)); ++ spin_unlock(&lo->plh_inode->i_lock); ++ } ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_layoutreturn_release(void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ ++ dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type); ++ if (lrp->args.return_type == RETURN_FILE) { ++ struct inode *ino = lrp->args.inode; ++ struct pnfs_layout_hdr *lo = NFS_I(ino)->layout; ++ ++ put_layout_hdr(lo); ++ } ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { ++ .rpc_call_prepare = nfs4_layoutreturn_prepare, ++ .rpc_call_done = nfs4_layoutreturn_done, ++ .rpc_release = nfs4_layoutreturn_release, ++}; ++ ++int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync) ++{ ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], ++ .rpc_argp = &lrp->args, ++ .rpc_resp = &lrp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = lrp->clp->cl_rpcclient, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutreturn_call_ops, ++ .callback_data = lrp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = task->tk_status; ++out: ++ dprintk("<-- %s\n", __func__); ++ rpc_put_task(task); ++ return status; ++} ++ ++/* ++ * Retrieve the list of Data Server devices from the MDS. 
++ */ ++static int _nfs4_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_getdevicelist_args args = { ++ .fh = fh, ++ .layoutclass = server->pnfs_curr_ld->id, ++ }; ++ struct nfs4_getdevicelist_res res = { ++ .devlist = devlist, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], ++ .rpc_argp = &args, ++ .rpc_resp = &res, ++ .rpc_cred = nfs4_get_machine_cred(server->nfs_client), ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ put_rpccred(msg.rpc_cred); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int nfs4_proc_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_getdevicelist(server, fh, devlist), ++ &exception); ++ } while (exception.retry); ++ ++ dprintk("%s: err=%d, num_devs=%u\n", __func__, ++ err, devlist->num_devs); ++ ++ return err; ++} ++EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); ++ + static int + _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) + { +@@ -5490,11 +5818,13 @@ _nfs4_proc_getdeviceinfo(struct nfs_serv + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], + .rpc_argp = &args, + .rpc_resp = &res, ++ .rpc_cred = nfs4_get_machine_cred(server->nfs_client), + }; + int status; + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ put_rpccred(msg.rpc_cred); + dprintk("<-- %s status=%d\n", __func__, status); + + return status; +diff -up linux-2.6.38.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.38.noarch/fs/nfs/nfs4renewd.c +--- linux-2.6.38.noarch/fs/nfs/nfs4renewd.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/nfs4renewd.c 2011-03-26 07:57:44.257821455 -0400 +@@ 
-65,7 +65,7 @@ nfs4_renew_state(struct work_struct *wor + dprintk("%s: start\n", __func__); + + rcu_read_lock(); +- if (list_empty(&clp->cl_superblocks)) { ++ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp)) { + rcu_read_unlock(); + goto out; + } +diff -up linux-2.6.38.noarch/fs/nfs/nfs4state.c.orig linux-2.6.38.noarch/fs/nfs/nfs4state.c +--- linux-2.6.38.noarch/fs/nfs/nfs4state.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/nfs4state.c 2011-03-26 07:57:44.258821447 -0400 +@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(str + int status; + struct nfs_fsinfo fsinfo; + ++ if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { ++ nfs4_schedule_state_renewal(clp); ++ return 0; ++ } ++ + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ +diff -up linux-2.6.38.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.38.noarch/fs/nfs/nfs4xdr.c +--- linux-2.6.38.noarch/fs/nfs/nfs4xdr.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/nfs4xdr.c 2011-03-26 07:57:44.261821422 -0400 +@@ -90,7 +90,7 @@ static int nfs4_stat_to_errno(int); + #define encode_getfh_maxsz (op_encode_hdr_maxsz) + #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +-#define nfs4_fattr_bitmap_maxsz 3 ++#define nfs4_fattr_bitmap_maxsz 4 + #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) + #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) + #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +@@ -112,7 +112,11 @@ static int nfs4_stat_to_errno(int); + #define encode_restorefh_maxsz (op_encode_hdr_maxsz) + #define decode_restorefh_maxsz (op_decode_hdr_maxsz) + #define encode_fsinfo_maxsz (encode_getattr_maxsz) +-#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) ++/* The 5 accounts for the PNFS attributes, and assumes that at most three ++ * layout types will be returned. 
++ */ ++#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ ++ nfs4_fattr_bitmap_maxsz + 8 + 5) + #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) + #define decode_renew_maxsz (op_decode_hdr_maxsz) + #define encode_setclientid_maxsz \ +@@ -311,6 +315,17 @@ static int nfs4_stat_to_errno(int); + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) + #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) + #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) ++#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ ++ encode_verifier_maxsz) ++#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ ++ 2 /* nfs_cookie4 gdlr_cookie */ + \ ++ decode_verifier_maxsz \ ++ /* verifier4 gdlr_verifier */ + \ ++ 1 /* gdlr_deviceid_list count */ + \ ++ XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ ++ NFS4_DEVICEID4_SIZE) \ ++ /* gdlr_deviceid_list */ + \ ++ 1 /* bool gdlr_eof */) + #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ + XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) + #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ +@@ -324,6 +339,17 @@ static int nfs4_stat_to_errno(int); + #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ + decode_stateid_maxsz + \ + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) ++#define encode_layoutcommit_maxsz (18 + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ ++ op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz) ++#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz) ++#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz + \ ++ 1 /* FIXME: opaque lrf_body always empty at ++ *the moment */) ++#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ ++ 1 + decode_stateid_maxsz) + #else /* CONFIG_NFS_V4_1 */ + #define encode_sequence_maxsz 0 + #define decode_sequence_maxsz 0 +@@ -713,6 +739,14 @@ static int nfs4_stat_to_errno(int); + #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) ++#define 
NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_getdevicelist_maxsz) ++#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_getdevicelist_maxsz) + #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_getdeviceinfo_maxsz) +@@ -727,6 +761,38 @@ static int nfs4_stat_to_errno(int); + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutget_maxsz) ++#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_layoutcommit_maxsz + \ ++ encode_getattr_maxsz) ++#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutcommit_maxsz + \ ++ decode_getattr_maxsz) ++#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutreturn_maxsz) ++#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutreturn_maxsz) ++#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_write_maxsz) ++#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_write_maxsz) ++#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_commit_maxsz) ++#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_commit_maxsz) + + const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + +@@ -1031,6 +1097,35 @@ static void encode_getattr_two(struct xd + hdr->replen += decode_getattr_maxsz; + } + ++static void ++encode_getattr_three(struct xdr_stream *xdr, ++ uint32_t 
bm0, uint32_t bm1, uint32_t bm2, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_GETATTR); ++ if (bm2) { ++ p = reserve_space(xdr, 16); ++ *p++ = cpu_to_be32(3); ++ *p++ = cpu_to_be32(bm0); ++ *p++ = cpu_to_be32(bm1); ++ *p = cpu_to_be32(bm2); ++ } else if (bm1) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(2); ++ *p++ = cpu_to_be32(bm0); ++ *p = cpu_to_be32(bm1); ++ } else { ++ p = reserve_space(xdr, 8); ++ *p++ = cpu_to_be32(1); ++ *p = cpu_to_be32(bm0); ++ } ++ hdr->nops++; ++ hdr->replen += decode_getattr_maxsz; ++} ++ + static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { + encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], +@@ -1039,8 +1134,11 @@ static void encode_getfattr(struct xdr_s + + static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { +- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], +- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); ++ encode_getattr_three(xdr, ++ bitmask[0] & nfs4_fsinfo_bitmap[0], ++ bitmask[1] & nfs4_fsinfo_bitmap[1], ++ bitmask[2] & nfs4_fsinfo_bitmap[2], ++ hdr); + } + + static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +@@ -1767,6 +1865,26 @@ static void encode_sequence(struct xdr_s + + #ifdef CONFIG_NFS_V4_1 + static void ++encode_getdevicelist(struct xdr_stream *xdr, ++ const struct nfs4_getdevicelist_args *args, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ nfs4_verifier dummy = { ++ .data = "dummmmmy", ++ }; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_GETDEVICELIST); ++ *p++ = cpu_to_be32(args->layoutclass); ++ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); ++ xdr_encode_hyper(p, 0ULL); /* cookie */ ++ encode_nfs4_verifier(xdr, &dummy); ++ hdr->nops++; ++ hdr->replen += decode_getdevicelist_maxsz; ++} ++ ++static void + encode_getdeviceinfo(struct xdr_stream *xdr, + const struct 
nfs4_getdeviceinfo_args *args, + struct compound_hdr *hdr) +@@ -1812,6 +1930,102 @@ encode_layoutget(struct xdr_stream *xdr, + hdr->nops++; + hdr->replen += decode_layoutget_maxsz; + } ++ ++static void ++encode_layoutcommit(struct xdr_stream *xdr, ++ struct inode *inode, ++ const struct nfs4_layoutcommit_args *args, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__, ++ args->range.length, args->range.offset, args->lastbytewritten, ++ args->layout_type); ++ ++ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ *p++ = cpu_to_be32(0); /* reclaim */ ++ p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(1); /* newoffset = TRUE */ ++ p = xdr_encode_hyper(p, args->lastbytewritten); ++ *p = cpu_to_be32(args->time_modify_changed != 0); ++ if (args->time_modify_changed) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(0); ++ *p++ = cpu_to_be32(args->time_modify.tv_sec); ++ *p = cpu_to_be32(args->time_modify.tv_nsec); ++ } ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(args->layout_type); ++ ++ if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) { ++ NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( ++ NFS_I(inode)->layout, xdr, args); ++ } else { ++ p = reserve_space(xdr, 4); ++ xdr_encode_opaque(p, NULL, 0); ++ } ++ ++ hdr->nops++; ++ hdr->replen += decode_layoutcommit_maxsz; ++} ++ ++static void ++encode_layoutreturn(struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_LAYOUTRETURN); ++ *p++ = cpu_to_be32(args->reclaim); ++ *p++ = cpu_to_be32(args->layout_type); ++ *p++ = cpu_to_be32(args->range.iomode); ++ *p = cpu_to_be32(args->return_type); ++ if (args->return_type == 
RETURN_FILE) { ++ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ spin_lock(&args->inode->i_lock); ++ memcpy(stateid.data, NFS_I(args->inode)->layout->plh_stateid.data, ++ NFS4_STATEID_SIZE); ++ spin_unlock(&args->inode->i_lock); ++ p = xdr_encode_opaque_fixed(p, &stateid.data, ++ NFS4_STATEID_SIZE); ++ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { ++ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( ++ NFS_I(args->inode)->layout, xdr, args); ++ } else { ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(0); ++ } ++ } ++ hdr->nops++; ++ hdr->replen += decode_layoutreturn_maxsz; ++} ++#else /* CONFIG_NFS_V4_1 */ ++static int ++encode_layoutcommit(struct xdr_stream *xdr, ++ struct inode *inode, ++ const struct nfs4_layoutcommit_args *args, ++ struct compound_hdr *hdr) ++{ ++ return 0; ++} ++ ++static void ++encode_layoutreturn(struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args, ++ struct compound_hdr *hdr) ++{ ++} ++ + #endif /* CONFIG_NFS_V4_1 */ + + /* +@@ -2408,7 +2622,7 @@ static void nfs4_xdr_enc_setclientid_con + struct compound_hdr hdr = { + .nops = 0, + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid_confirm(xdr, arg, &hdr); +@@ -2534,7 +2748,7 @@ static void nfs4_xdr_enc_get_lease_time( + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->la_seq_args, &hdr); +@@ -2561,6 +2775,24 @@ static void nfs4_xdr_enc_reclaim_complet + } + + /* ++ * Encode GETDEVICELIST request ++ */ ++static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, 
++ struct xdr_stream *xdr, ++ struct nfs4_getdevicelist_args *args) ++{ ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ encode_compound_hdr(xdr, req, &hdr); ++ encode_sequence(xdr, &args->seq_args, &hdr); ++ encode_putfh(xdr, args->fh, &hdr); ++ encode_getdevicelist(xdr, args, &hdr); ++ encode_nops(&hdr); ++} ++ ++/* + * Encode GETDEVICEINFO request + */ + static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, +@@ -2601,6 +2833,81 @@ static void nfs4_xdr_enc_layoutget(struc + encode_layoutget(xdr, args, &hdr); + encode_nops(&hdr); + } ++ ++/* ++ * Encode LAYOUTCOMMIT request ++ */ ++static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, ++ struct xdr_stream *xdr, ++ struct nfs4_layoutcommit_args *args) ++{ ++ struct nfs4_layoutcommit_data *data = ++ container_of(args, struct nfs4_layoutcommit_data, args); ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ encode_compound_hdr(xdr, req, &hdr); ++ encode_sequence(xdr, &args->seq_args, &hdr); ++ encode_putfh(xdr, args->fh, &hdr); ++ encode_layoutcommit(xdr, data->args.inode, args, &hdr); ++ encode_getfattr(xdr, args->bitmask, &hdr); ++ encode_nops(&hdr); ++} ++ ++/* ++ * Encode LAYOUTRETURN request ++ */ ++static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, ++ struct xdr_stream *xdr, ++ struct nfs4_layoutreturn_args *args) ++{ ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ encode_compound_hdr(xdr, req, &hdr); ++ encode_sequence(xdr, &args->seq_args, &hdr); ++ encode_putfh(xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutreturn(xdr, args, &hdr); ++ encode_nops(&hdr); ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server WRITE request ++ */ ++static void nfs4_xdr_enc_dswrite(struct rpc_rqst *req, ++ struct xdr_stream *xdr, ++ struct nfs_writeargs *args) ++{ ++ struct compound_hdr hdr = { ++ .minorversion = 
nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ encode_compound_hdr(xdr, req, &hdr); ++ encode_sequence(xdr, &args->seq_args, &hdr); ++ encode_putfh(xdr, args->fh, &hdr); ++ encode_write(xdr, args, &hdr); ++ encode_nops(&hdr); ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server COMMIT request ++ */ ++static void nfs4_xdr_enc_dscommit(struct rpc_rqst *req, ++ struct xdr_stream *xdr, ++ struct nfs_writeargs *args) ++{ ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ encode_compound_hdr(xdr, req, &hdr); ++ encode_sequence(xdr, &args->seq_args, &hdr); ++ encode_putfh(xdr, args->fh, &hdr); ++ encode_commit(xdr, args, &hdr); ++ encode_nops(&hdr); ++} + #endif /* CONFIG_NFS_V4_1 */ + + static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +@@ -2701,14 +3008,17 @@ static int decode_attr_bitmap(struct xdr + goto out_overflow; + bmlen = be32_to_cpup(p); + +- bitmap[0] = bitmap[1] = 0; ++ bitmap[0] = bitmap[1] = bitmap[2] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; + if (bmlen > 0) { + bitmap[0] = be32_to_cpup(p++); +- if (bmlen > 1) +- bitmap[1] = be32_to_cpup(p); ++ if (bmlen > 1) { ++ bitmap[1] = be32_to_cpup(p++); ++ if (bmlen > 2) ++ bitmap[2] = be32_to_cpup(p); ++ } + } + return 0; + out_overflow: +@@ -2740,8 +3050,9 @@ static int decode_attr_supported(struct + return ret; + bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; + } else +- bitmask[0] = bitmask[1] = 0; +- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); ++ bitmask[0] = bitmask[1] = bitmask[2] = 0; ++ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, ++ bitmask[0], bitmask[1], bitmask[2]); + return 0; + } + +@@ -3794,7 +4105,7 @@ out_overflow: + static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = 
decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3820,7 +4131,7 @@ xdr_error: + static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3852,7 +4163,7 @@ xdr_error: + static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3994,7 +4305,7 @@ static int decode_getfattr_generic(struc + { + __be32 *savep; + uint32_t attrlen, +- bitmap[2] = {0}; ++ bitmap[3] = {0}; + int status; + + status = decode_op_hdr(xdr, OP_GETATTR); +@@ -4080,10 +4391,32 @@ static int decode_attr_pnfstype(struct x + return status; + } + ++/* ++ * The prefered block size for layout directed io ++ */ ++static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *res) ++{ ++ __be32 *p; ++ ++ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); ++ *res = 0; ++ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) { ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++ } ++ *res = be32_to_cpup(p); ++ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; ++ } ++ return 0; ++} ++ + static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2]; ++ uint32_t attrlen, bitmap[3]; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -4111,6 +4444,9 @@ static int decode_fsinfo(struct xdr_stre + status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + if (status != 0) + goto xdr_error; ++ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); ++ if (status) ++ goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); + xdr_error: +@@ -4530,7 
+4866,7 @@ static int decode_getacl(struct xdr_stre + { + __be32 *savep; + uint32_t attrlen, +- bitmap[2] = {0}; ++ bitmap[3] = {0}; + struct kvec *iov = req->rq_rcv_buf.head; + int status; + +@@ -4878,6 +5214,50 @@ out_overflow: + } + + #if defined(CONFIG_NFS_V4_1) ++/* ++ * TODO: Need to handle case when EOF != true; ++ */ ++static int decode_getdevicelist(struct xdr_stream *xdr, ++ struct pnfs_devicelist *res) ++{ ++ __be32 *p; ++ int status, i; ++ struct nfs_writeverf verftemp; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICELIST); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 8 + 8 + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ ++ /* TODO: Skip cookie for now */ ++ p += 2; ++ ++ /* Read verifier */ ++ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); ++ ++ res->num_devs = be32_to_cpup(p); ++ ++ dprintk("%s: num_dev %d\n", __func__, res->num_devs); ++ ++ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) ++ return -NFS4ERR_REP_TOO_BIG; ++ ++ p = xdr_inline_decode(xdr, ++ res->num_devs * NFS4_DEVICEID4_SIZE + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ for (i = 0; i < res->num_devs; i++) ++ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, ++ NFS4_DEVICEID4_SIZE); ++ res->eof = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} + + static int decode_getdeviceinfo(struct xdr_stream *xdr, + struct pnfs_device *pdev) +@@ -5003,6 +5383,56 @@ out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; + } ++ ++static int decode_layoutreturn(struct xdr_stream *xdr, ++ struct nfs4_layoutreturn_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTRETURN); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->lrs_present = be32_to_cpup(p); ++ if (res->lrs_present) ++ status = decode_stateid(xdr, &res->stateid); ++ return status; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ 
return -EIO; ++} ++ ++static int decode_layoutcommit(struct xdr_stream *xdr, ++ struct rpc_rqst *req, ++ struct nfs4_layoutcommit_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); ++ res->status = status; ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->sizechanged = be32_to_cpup(p); ++ ++ if (res->sizechanged) { ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ xdr_decode_hyper(p, &res->newsize); ++ } ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} + #endif /* CONFIG_NFS_V4_1 */ + + /* +@@ -6019,6 +6449,32 @@ static int nfs4_xdr_dec_reclaim_complete + } + + /* ++ * Decode GETDEVICELIST response ++ */ ++static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfs4_getdevicelist_res *res) ++{ ++ struct compound_hdr hdr; ++ int status; ++ ++ dprintk("encoding getdevicelist!\n"); ++ ++ status = decode_compound_hdr(xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_putfh(xdr); ++ if (status != 0) ++ goto out; ++ status = decode_getdevicelist(xdr, res->devlist); ++out: ++ return status; ++} ++ ++/* + * Decode GETDEVINFO response + */ + static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, +@@ -6062,6 +6518,108 @@ static int nfs4_xdr_dec_layoutget(struct + out: + return status; + } ++ ++/* ++ * Decode LAYOUTRETURN response ++ */ ++static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfs4_layoutreturn_res *res) ++{ ++ struct compound_hdr hdr; ++ int status; ++ ++ status = decode_compound_hdr(xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(xdr); ++ if (status) ++ goto out; ++ status = 
decode_layoutreturn(xdr, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTCOMMIT response ++ */ ++static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfs4_layoutcommit_res *res) ++{ ++ struct compound_hdr hdr; ++ int status; ++ ++ status = decode_compound_hdr(xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(xdr); ++ if (status) ++ goto out; ++ status = decode_layoutcommit(xdr, rqstp, res); ++ if (status) ++ goto out; ++ decode_getfattr(xdr, res->fattr, res->server, ++ !RPC_IS_ASYNC(rqstp->rq_task)); ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server WRITE response ++ */ ++static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfs_writeres *res) ++{ ++ struct compound_hdr hdr; ++ int status; ++ ++ status = decode_compound_hdr(xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(xdr); ++ if (status) ++ goto out; ++ status = decode_write(xdr, res); ++ if (!status) ++ return res->count; ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server COMMIT response ++ */ ++static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfs_writeres *res) ++{ ++ struct compound_hdr hdr; ++ int status; ++ ++ status = decode_compound_hdr(xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(xdr); ++ if (status) ++ goto out; ++ status = decode_commit(xdr, res); ++out: ++ return status; ++} + #endif /* CONFIG_NFS_V4_1 */ + + /** +@@ -6081,7 +6639,7 @@ out: + int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) + { +- uint32_t bitmap[2] = {0}; ++ uint32_t bitmap[3] = {0}; + uint32_t len; + 
__be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) +@@ -6263,8 +6821,13 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), ++ PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), + PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC(LAYOUTGET, enc_layoutget, dec_layoutget), ++ PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), ++ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), ++ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite), ++ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit), + #endif /* CONFIG_NFS_V4_1 */ + }; + +diff -up linux-2.6.38.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.38.noarch/fs/nfs/objlayout/Kbuild +--- linux-2.6.38.noarch/fs/nfs/objlayout/Kbuild.orig 2011-03-26 07:57:44.262821413 -0400 ++++ linux-2.6.38.noarch/fs/nfs/objlayout/Kbuild 2011-03-26 07:57:44.262821413 -0400 +@@ -0,0 +1,11 @@ ++# ++# Makefile for the pNFS Objects Layout Driver kernel module ++# ++objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o ++ ++# ++# Panasas pNFS Layout Driver kernel module ++# ++panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o ++obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o +diff -up linux-2.6.38.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.38.noarch/fs/nfs/objlayout/objio_osd.c +--- linux-2.6.38.noarch/fs/nfs/objlayout/objio_osd.c.orig 2011-03-26 07:57:44.263821404 -0400 ++++ linux-2.6.38.noarch/fs/nfs/objlayout/objio_osd.c 2011-03-26 07:57:44.263821404 -0400 +@@ -0,0 +1,1060 @@ ++/* ++ * objio_osd.c ++ * ++ * pNFS Objects layout implementation over open-osd initiator library ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. 
++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++#define _LLU(x) ((unsigned long long)x) ++ ++enum { BIO_MAX_PAGES_KMALLOC = ++ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), ++}; ++ ++/* A per mountpoint struct currently for device cache */ ++struct objio_mount_type { ++ struct list_head dev_list; ++ spinlock_t dev_list_lock; ++}; ++ ++struct _dev_ent { ++ struct list_head list; ++ struct nfs4_deviceid d_id; ++ struct osd_dev *od; ++}; ++ ++static void _dev_list_remove_all(struct objio_mount_type *omt) ++{ ++ spin_lock(&omt->dev_list_lock); ++ ++ while (!list_empty(&omt->dev_list)) { ++ struct _dev_ent *de = list_entry(omt->dev_list.next, ++ struct _dev_ent, list); ++ ++ list_del_init(&de->list); ++ osduld_put_device(de->od); ++ kfree(de); ++ } ++ ++ spin_unlock(&omt->dev_list_lock); ++} ++ ++static struct osd_dev *___dev_list_find(struct objio_mount_type *omt, ++ struct nfs4_deviceid *d_id) ++{ ++ struct list_head *le; ++ ++ list_for_each(le, &omt->dev_list) { ++ struct _dev_ent *de = list_entry(le, struct _dev_ent, list); ++ ++ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id))) ++ return de->od; ++ } ++ ++ return NULL; ++} ++ ++static struct osd_dev *_dev_list_find(struct objio_mount_type *omt, ++ struct nfs4_deviceid *d_id) ++{ ++ struct osd_dev *od; ++ ++ spin_lock(&omt->dev_list_lock); ++ od = ___dev_list_find(omt, d_id); ++ spin_unlock(&omt->dev_list_lock); ++ return od; ++} ++ ++static int _dev_list_add(struct objio_mount_type *omt, ++ struct nfs4_deviceid *d_id, struct osd_dev *od) ++{ ++ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL); ++ ++ if (!de) ++ return -ENOMEM; ++ ++ spin_lock(&omt->dev_list_lock); ++ ++ if (___dev_list_find(omt, d_id)) { ++ kfree(de); ++ goto out; ++ } ++ ++ de->d_id = *d_id; ++ de->od = od; ++ list_add(&de->list, &omt->dev_list); ++ ++out: ++ spin_unlock(&omt->dev_list_lock); ++ return 0; ++} ++ ++struct 
objio_segment { ++ struct pnfs_osd_layout *layout; ++ ++ unsigned mirrors_p1; ++ unsigned stripe_unit; ++ unsigned group_width; /* Data stripe_units without integrity comps */ ++ u64 group_depth; ++ unsigned group_count; ++ ++ unsigned num_comps; ++ /* variable length */ ++ struct osd_dev *ods[1]; ++}; ++ ++struct objio_state; ++typedef ssize_t (*objio_done_fn)(struct objio_state *ios); ++ ++struct objio_state { ++ /* Generic layer */ ++ struct objlayout_io_state ol_state; ++ ++ struct objio_segment *objio_seg; ++ ++ struct kref kref; ++ objio_done_fn done; ++ void *private; ++ ++ unsigned long length; ++ unsigned numdevs; /* Actually used devs in this IO */ ++ /* A per-device variable array of size numdevs */ ++ struct _objio_per_comp { ++ struct bio *bio; ++ struct osd_request *or; ++ unsigned long length; ++ u64 offset; ++ unsigned dev; ++ } per_dev[]; ++}; ++ ++/* Send and wait for a get_device_info of devices in the layout, ++ then look them up with the osd_initiator library */ ++static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay, ++ struct objio_segment *objio_seg, unsigned comp) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ struct pnfs_osd_deviceaddr *deviceaddr; ++ struct nfs4_deviceid *d_id; ++ struct osd_dev *od; ++ struct osd_dev_info odi; ++ struct objio_mount_type *omt = NFS_SERVER(pnfslay->plh_inode)->pnfs_ld_data; ++ int err; ++ ++ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id; ++ ++ od = _dev_list_find(omt, d_id); ++ if (od) ++ return od; ++ ++ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr); ++ if (unlikely(err)) { ++ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err); ++ return ERR_PTR(err); ++ } ++ ++ odi.systemid_len = deviceaddr->oda_systemid.len; ++ if (odi.systemid_len > sizeof(odi.systemid)) { ++ err = -EINVAL; ++ goto out; ++ } else if (odi.systemid_len) ++ memcpy(odi.systemid, deviceaddr->oda_systemid.data, ++ odi.systemid_len); ++ odi.osdname_len = 
deviceaddr->oda_osdname.len; ++ odi.osdname = (u8 *)deviceaddr->oda_osdname.data; ++ ++ if (!odi.osdname_len && !odi.systemid_len) { ++ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", ++ __func__); ++ err = -ENODEV; ++ goto out; ++ } ++ ++ od = osduld_info_lookup(&odi); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ dprintk("%s: osduld_info_lookup => %d\n", __func__, err); ++ goto out; ++ } ++ ++ _dev_list_add(omt, d_id, od); ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ objlayout_put_deviceinfo(deviceaddr); ++ return err ? ERR_PTR(err) : od; ++} ++ ++static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, ++ struct objio_segment *objio_seg) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ unsigned i, num_comps = layout->olo_num_comps; ++ int err; ++ ++ /* lookup all devices */ ++ for (i = 0; i < num_comps; i++) { ++ struct osd_dev *od; ++ ++ od = _device_lookup(pnfslay, objio_seg, i); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ goto out; ++ } ++ objio_seg->ods[i] = od; ++ } ++ objio_seg->num_comps = num_comps; ++ err = 0; ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ return err; ++} ++ ++static int _verify_data_map(struct pnfs_osd_layout *layout) ++{ ++ struct pnfs_osd_data_map *data_map = &layout->olo_map; ++ u64 stripe_length; ++ u32 group_width; ++ ++/* FIXME: Only raid0 for now. 
if not go through MDS */ ++ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { ++ printk(KERN_ERR "Only RAID_0 for now\n"); ++ return -ENOTSUPP; ++ } ++ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { ++ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", ++ data_map->odm_num_comps, data_map->odm_mirror_cnt); ++ return -EINVAL; ++ } ++ ++ if (data_map->odm_group_width) ++ group_width = data_map->odm_group_width; ++ else ++ group_width = data_map->odm_num_comps / ++ (data_map->odm_mirror_cnt + 1); ++ ++ stripe_length = (u64)data_map->odm_stripe_unit * group_width; ++ if (stripe_length >= (1ULL << 32)) { ++ printk(KERN_ERR "Total Stripe length(0x%llx)" ++ " >= 32bit is not supported\n", _LLU(stripe_length)); ++ return -ENOTSUPP; ++ } ++ ++ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { ++ printk(KERN_ERR "Stripe Unit(0x%llx)" ++ " must be Multples of PAGE_SIZE(0x%lx)\n", ++ _LLU(data_map->odm_stripe_unit), PAGE_SIZE); ++ return -ENOTSUPP; ++ } ++ ++ return 0; ++} ++ ++int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) ++{ ++ struct objio_segment *objio_seg; ++ int err; ++ ++ err = _verify_data_map(layout); ++ if (unlikely(err)) ++ return err; ++ ++ objio_seg = kzalloc(sizeof(*objio_seg) + ++ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]), ++ GFP_KERNEL); ++ if (!objio_seg) ++ return -ENOMEM; ++ ++ objio_seg->layout = layout; ++ err = objio_devices_lookup(pnfslay, objio_seg); ++ if (err) ++ goto free_seg; ++ ++ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1; ++ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit; ++ if (layout->olo_map.odm_group_width) { ++ objio_seg->group_width = layout->olo_map.odm_group_width; ++ objio_seg->group_depth = layout->olo_map.odm_group_depth; ++ objio_seg->group_count = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1 / ++ objio_seg->group_width; ++ } else { ++ 
objio_seg->group_width = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1; ++ objio_seg->group_depth = -1; ++ objio_seg->group_count = 1; ++ } ++ ++ *outp = objio_seg; ++ return 0; ++ ++free_seg: ++ dprintk("%s: Error: return %d\n", __func__, err); ++ kfree(objio_seg); ++ *outp = NULL; ++ return err; ++} ++ ++void objio_free_lseg(void *p) ++{ ++ struct objio_segment *objio_seg = p; ++ ++ kfree(objio_seg); ++} ++ ++int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct objio_segment *objio_seg = seg; ++ struct objio_state *ios; ++ const unsigned first_size = sizeof(*ios) + ++ objio_seg->num_comps * sizeof(ios->per_dev[0]); ++ const unsigned sec_size = objio_seg->num_comps * ++ sizeof(ios->ol_state.ioerrs[0]); ++ ++ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps); ++ ios = kzalloc(first_size + sec_size, GFP_KERNEL); ++ if (unlikely(!ios)) ++ return -ENOMEM; ++ ++ ios->objio_seg = objio_seg; ++ ios->ol_state.ioerrs = ((void *)ios) + first_size; ++ ios->ol_state.num_comps = objio_seg->num_comps; ++ ++ *outp = &ios->ol_state; ++ return 0; ++} ++ ++void objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ ++ kfree(ios); ++} ++ ++enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) ++{ ++ switch (oep) { ++ case OSD_ERR_PRI_NO_ERROR: ++ return (enum pnfs_osd_errno)0; ++ ++ case OSD_ERR_PRI_CLEAR_PAGES: ++ BUG_ON(1); ++ return 0; ++ ++ case OSD_ERR_PRI_RESOURCE: ++ return PNFS_OSD_ERR_RESOURCE; ++ case OSD_ERR_PRI_BAD_CRED: ++ return PNFS_OSD_ERR_BAD_CRED; ++ case OSD_ERR_PRI_NO_ACCESS: ++ return PNFS_OSD_ERR_NO_ACCESS; ++ case OSD_ERR_PRI_UNREACHABLE: ++ return PNFS_OSD_ERR_UNREACHABLE; ++ case OSD_ERR_PRI_NOT_FOUND: ++ return PNFS_OSD_ERR_NOT_FOUND; ++ case OSD_ERR_PRI_NO_SPACE: ++ return PNFS_OSD_ERR_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case OSD_ERR_PRI_EIO: ++ return PNFS_OSD_ERR_EIO; ++ 
} ++} ++ ++static void _clear_bio(struct bio *bio) ++{ ++ struct bio_vec *bv; ++ unsigned i; ++ ++ __bio_for_each_segment(bv, bio, i, 0) { ++ unsigned this_count = bv->bv_len; ++ ++ if (likely(PAGE_SIZE == this_count)) ++ clear_highpage(bv->bv_page); ++ else ++ zero_user(bv->bv_page, bv->bv_offset, this_count); ++ } ++} ++ ++static int _io_check(struct objio_state *ios, bool is_write) ++{ ++ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; ++ int lin_ret = 0; ++ int i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_sense_info osi; ++ struct osd_request *or = ios->per_dev[i].or; ++ int ret; ++ ++ if (!or) ++ continue; ++ ++ ret = osd_req_decode_sense(or, &osi); ++ if (likely(!ret)) ++ continue; ++ ++ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { ++ /* start read offset passed endof file */ ++ BUG_ON(is_write); ++ _clear_bio(ios->per_dev[i].bio); ++ dprintk("%s: start read offset passed end of file " ++ "offset=0x%llx, length=0x%lx\n", __func__, ++ _LLU(ios->per_dev[i].offset), ++ ios->per_dev[i].length); ++ ++ continue; /* we recovered */ ++ } ++ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev, ++ osd_pri_2_pnfs_err(osi.osd_err_pri), ++ ios->per_dev[i].offset, ++ ios->per_dev[i].length, ++ is_write); ++ ++ if (osi.osd_err_pri >= oep) { ++ oep = osi.osd_err_pri; ++ lin_ret = ret; ++ } ++ } ++ ++ return lin_ret; ++} ++ ++/* ++ * Common IO state helpers. 
++ */ ++static void _io_free(struct objio_state *ios) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[i]; ++ ++ if (per_dev->or) { ++ osd_end_request(per_dev->or); ++ per_dev->or = NULL; ++ } ++ ++ if (per_dev->bio) { ++ bio_put(per_dev->bio); ++ per_dev->bio = NULL; ++ } ++ } ++} ++ ++struct osd_dev * _io_od(struct objio_state *ios, unsigned dev) ++{ ++ unsigned min_dev = ios->objio_seg->layout->olo_comps_index; ++ unsigned max_dev = min_dev + ios->ol_state.num_comps; ++ ++ BUG_ON(dev < min_dev || max_dev <= dev); ++ return ios->objio_seg->ods[dev - min_dev]; ++} ++ ++struct _striping_info { ++ u64 obj_offset; ++ u64 group_length; ++ u64 total_group_length; ++ u64 Major; ++ unsigned dev; ++ unsigned unit_off; ++}; ++ ++static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, ++ struct _striping_info *si) ++{ ++ u32 stripe_unit = ios->objio_seg->stripe_unit; ++ u32 group_width = ios->objio_seg->group_width; ++ u64 group_depth = ios->objio_seg->group_depth; ++ u32 U = stripe_unit * group_width; ++ ++ u64 T = U * group_depth; ++ u64 S = T * ios->objio_seg->group_count; ++ u64 M = div64_u64(file_offset, S); ++ ++ /* ++ G = (L - (M * S)) / T ++ H = (L - (M * S)) % T ++ */ ++ u64 LmodU = file_offset - M * S; ++ u32 G = div64_u64(LmodU, T); ++ u64 H = LmodU - G * T; ++ ++ u32 N = div_u64(H, U); ++ ++ div_u64_rem(file_offset, stripe_unit, &si->unit_off); ++ si->obj_offset = si->unit_off + (N * stripe_unit) + ++ (M * group_depth * stripe_unit); ++ ++ /* "H - (N * U)" is just "H % U" so it's bound to u32 */ ++ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; ++ si->dev *= ios->objio_seg->mirrors_p1; ++ ++ si->group_length = T - H; ++ si->total_group_length = T; ++ si->Major = M; ++} ++ ++static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, ++ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len) ++{ ++ unsigned pg = *cur_pg; ++ struct request_queue *q = ++ 
osd_request_queue(_io_od(ios, per_dev->dev)); ++ ++ per_dev->length += cur_len; ++ ++ if (per_dev->bio == NULL) { ++ unsigned stripes = ios->ol_state.num_comps / ++ ios->objio_seg->mirrors_p1; ++ unsigned pages_in_stripe = stripes * ++ (ios->objio_seg->stripe_unit / PAGE_SIZE); ++ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / ++ stripes; ++ ++ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); ++ if (unlikely(!per_dev->bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", bio_size); ++ return -ENOMEM; ++ } ++ } ++ ++ while (cur_len > 0) { ++ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); ++ unsigned added_len; ++ ++ BUG_ON(ios->ol_state.nr_pages <= pg); ++ cur_len -= pglen; ++ ++ added_len = bio_add_pc_page(q, per_dev->bio, ++ ios->ol_state.pages[pg], pglen, pgbase); ++ if (unlikely(pglen != added_len)) ++ return -ENOMEM; ++ pgbase = 0; ++ ++pg; ++ } ++ BUG_ON(cur_len); ++ ++ *cur_pg = pg; ++ return 0; ++} ++ ++static int _prepare_one_group(struct objio_state *ios, u64 length, ++ struct _striping_info *si, unsigned first_comp, ++ unsigned *last_pg) ++{ ++ unsigned stripe_unit = ios->objio_seg->stripe_unit; ++ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1; ++ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1; ++ unsigned dev = si->dev; ++ unsigned first_dev = dev - (dev % devs_in_group); ++ unsigned comp = first_comp + (dev - first_dev); ++ unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; ++ unsigned cur_pg = *last_pg; ++ int ret = 0; ++ ++ while (length) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[comp]; ++ unsigned cur_len, page_off = 0; ++ ++ if (!per_dev->length) { ++ per_dev->dev = dev; ++ if (dev < si->dev) { ++ per_dev->offset = si->obj_offset + stripe_unit - ++ si->unit_off; ++ cur_len = stripe_unit; ++ } else if (dev == si->dev) { ++ per_dev->offset = si->obj_offset; ++ cur_len = stripe_unit - si->unit_off; ++ page_off = si->unit_off & ~PAGE_MASK; ++ BUG_ON(page_off && ++ (page_off != ios->ol_state.pgbase)); ++ } else { /* dev > si->dev */ ++ per_dev->offset = si->obj_offset - si->unit_off; ++ cur_len = stripe_unit; ++ } ++ ++ if (max_comp < comp) ++ max_comp = comp; ++ ++ dev += mirrors_p1; ++ dev = (dev % devs_in_group) + first_dev; ++ } else { ++ cur_len = stripe_unit; ++ } ++ if (cur_len >= length) ++ cur_len = length; ++ ++ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, ++ cur_len); ++ if (unlikely(ret)) ++ goto out; ++ ++ comp += mirrors_p1; ++ comp = (comp % devs_in_group) + first_comp; ++ ++ length -= cur_len; ++ ios->length += cur_len; ++ } ++out: ++ ios->numdevs = max_comp + mirrors_p1; ++ *last_pg = cur_pg; ++ return ret; ++} ++ ++static int _io_rw_pagelist(struct objio_state *ios) ++{ ++ u64 length = ios->ol_state.count; ++ struct _striping_info si; ++ unsigned devs_in_group = ios->objio_seg->group_width * ++ ios->objio_seg->mirrors_p1; ++ unsigned first_comp = 0; ++ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps; ++ unsigned last_pg = 0; ++ int ret = 0; ++ ++ _calc_stripe_info(ios, ios->ol_state.offset, &si); ++ while (length) { ++ if (length < si.group_length) ++ si.group_length = length; ++ ++ ret = _prepare_one_group(ios, si.group_length, &si, first_comp, ++ &last_pg); ++ if (unlikely(ret)) ++ goto out; ++ ++ length -= si.group_length; ++ ++ si.group_length = si.total_group_length; ++ si.unit_off = 0; ++ ++si.Major; ++ si.obj_offset = si.Major * 
ios->objio_seg->stripe_unit * ++ ios->objio_seg->group_depth; ++ ++ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; ++ si.dev %= num_comps; ++ ++ first_comp += devs_in_group; ++ first_comp %= num_comps; ++ } ++ ++out: ++ if (!ios->length) ++ return ret; ++ ++ return 0; ++} ++ ++static ssize_t _sync_done(struct objio_state *ios) ++{ ++ struct completion *waiting = ios->private; ++ ++ complete(waiting); ++ return 0; ++} ++ ++static void _last_io(struct kref *kref) ++{ ++ struct objio_state *ios = container_of(kref, struct objio_state, kref); ++ ++ ios->done(ios); ++} ++ ++static void _done_io(struct osd_request *or, void *p) ++{ ++ struct objio_state *ios = p; ++ ++ kref_put(&ios->kref, _last_io); ++} ++ ++static ssize_t _io_exec(struct objio_state *ios) ++{ ++ DECLARE_COMPLETION_ONSTACK(wait); ++ ssize_t status = 0; /* sync status */ ++ unsigned i; ++ objio_done_fn saved_done_fn = ios->done; ++ bool sync = ios->ol_state.sync; ++ ++ if (sync) { ++ ios->done = _sync_done; ++ ios->private = &wait; ++ } ++ ++ kref_init(&ios->kref); ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_request *or = ios->per_dev[i].or; ++ ++ if (!or) ++ continue; ++ ++ kref_get(&ios->kref); ++ osd_execute_request_async(or, _done_io, ios); ++ } ++ ++ kref_put(&ios->kref, _last_io); ++ ++ if (sync) { ++ wait_for_completion(&wait); ++ status = saved_done_fn(ios); ++ } ++ ++ return status; ++} ++ ++/* ++ * read ++ */ ++static ssize_t _read_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, false); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) ++ status = ios->length; ++ else ++ status = ret; ++ ++ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct osd_request *or = NULL; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = per_dev->dev; ++ struct pnfs_osd_object_cred *cred = ++ 
&ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ int ret; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ ++err: ++ return ret; ++} ++ ++static ssize_t _read_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = _read_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _read_done; ++ return _io_exec(ios); /* In sync mode exec returns the io status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _read_exec(ios); ++} ++ ++/* ++ * write ++ */ ++static ssize_t _write_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, true); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) { ++ /* FIXME: should be based on the OSD's persistence model ++ * See OSD2r05 Section 4.13 Data persistence model */ ++ ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC; ++ status = ios->length; ++ } else { ++ status = ret; ++ } ++ ++ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int 
_write_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = ios->per_dev[cur_comp].dev; ++ unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1; ++ int ret; ++ ++ for (; cur_comp < last_comp; ++cur_comp, ++dev) { ++ struct osd_request *or = NULL; ++ struct pnfs_osd_object_cred *cred = ++ &ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ struct bio *bio; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ if (per_dev != master_dev) { ++ bio = bio_kmalloc(GFP_KERNEL, ++ master_dev->bio->bi_max_vecs); ++ if (unlikely(!bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", ++ master_dev->bio->bi_max_vecs); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ __bio_clone(bio, master_dev->bio); ++ bio->bi_bdev = NULL; ++ bio->bi_next = NULL; ++ per_dev->bio = bio; ++ per_dev->dev = dev; ++ per_dev->length = master_dev->length; ++ per_dev->offset = master_dev->offset; ++ } else { ++ bio = master_dev->bio; ++ /* FIXME: bio_set_dir() */ ++ bio->bi_rw |= REQ_WRITE; ++ } ++ ++ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ } ++ ++err: ++ return ret; ++} ++ ++static ssize_t _write_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = _write_mirrors(ios, 
i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _write_done; ++ return _io_exec(ios); /* In sync mode exec returns the io->status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ /* TODO: ios->stable = stable; */ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _write_exec(ios); ++} ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++objlayout_get_blocksize(void) ++{ ++ ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE; ++ ++ return sz; ++} ++ ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. ++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++static struct pnfs_layoutdriver_type objlayout_type = { ++ .id = LAYOUT_OSD2_OBJECTS, ++ .name = "LAYOUT_OSD2_OBJECTS", ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ ++ .set_layoutdriver = objlayout_set_layoutdriver, ++ .clear_layoutdriver = objlayout_clear_layoutdriver, ++ ++ .alloc_layout_hdr = objlayout_alloc_layout_hdr, ++ .free_layout_hdr = objlayout_free_layout_hdr, ++ ++ .alloc_lseg = objlayout_alloc_lseg, ++ .free_lseg = objlayout_free_lseg, ++ ++ .get_blocksize = objlayout_get_blocksize, ++ ++ .read_pagelist = objlayout_read_pagelist, ++ .write_pagelist = objlayout_write_pagelist, ++ .commit = objlayout_commit, ++ ++ .encode_layoutcommit = objlayout_encode_layoutcommit, ++ .encode_layoutreturn = objlayout_encode_layoutreturn, ++}; ++ ++void *objio_init_mt(void) ++{ ++ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL); ++ ++ if (!omt) ++ return ERR_PTR(-ENOMEM); ++ ++ INIT_LIST_HEAD(&omt->dev_list); ++ spin_lock_init(&omt->dev_list_lock); ++ return omt; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++ _dev_list_remove_all(mountid); ++ kfree(mountid); ++} ++ 
++MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++objlayout_init(void) ++{ ++ int ret = pnfs_register_layoutdriver(&objlayout_type); ++ ++ if (ret) ++ printk(KERN_INFO ++ "%s: Registering OSD pNFS Layout Driver failed: error=%d\n", ++ __func__, ret); ++ else ++ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", ++ __func__); ++ return ret; ++} ++ ++static void __exit ++objlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", ++ __func__); ++} ++ ++module_init(objlayout_init); ++module_exit(objlayout_exit); +diff -up linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.c +--- linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.c.orig 2011-03-26 07:57:44.265821386 -0400 ++++ linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.c 2011-03-26 07:57:44.265821386 -0400 +@@ -0,0 +1,773 @@ ++/* ++ * objlayout.c ++ * ++ * pNFS layout driver for Panasas OSDs ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pnfs_client_operations *pnfs_client_ops; ++ ++/* ++ * Create a objlayout layout structure for the given inode and return it. ++ */ ++struct pnfs_layout_hdr * ++objlayout_alloc_layout_hdr(struct inode *inode) ++{ ++ struct objlayout *objlay; ++ ++ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL); ++ if (objlay) { ++ spin_lock_init(&objlay->lock); ++ INIT_LIST_HEAD(&objlay->err_list); ++ } ++ dprintk("%s: Return %p\n", __func__, objlay); ++ return &objlay->pnfs_layout; ++} ++ ++/* ++ * Free an objlayout layout structure ++ */ ++void ++objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) ++{ ++ struct objlayout *objlay = OBJLAYOUT(lo); ++ ++ dprintk("%s: objlay %p\n", __func__, objlay); ++ ++ WARN_ON(!list_empty(&objlay->err_list)); ++ kfree(objlay); ++} ++ ++/* ++ * Unmarshall layout and store it in pnfslay. 
++ */ ++struct pnfs_layout_segment * ++objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, ++ struct nfs4_layoutget_res *lgr) ++{ ++ int status; ++ void *layout = lgr->layout.buf; ++ struct objlayout_segment *objlseg; ++ struct pnfs_osd_layout *pnfs_osd_layout; ++ ++ dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout); ++ ++ BUG_ON(!layout); ++ ++ status = -ENOMEM; ++ objlseg = kzalloc(sizeof(*objlseg) + ++ pnfs_osd_layout_incore_sz(layout), GFP_KERNEL); ++ if (!objlseg) ++ goto err; ++ ++ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout); ++ ++ objlseg->lseg.pls_range = lgr->range; ++ status = objio_alloc_lseg(&objlseg->internal, pnfslay, &objlseg->lseg, ++ pnfs_osd_layout); ++ if (status) ++ goto err; ++ ++ dprintk("%s: Return %p\n", __func__, &objlseg->lseg); ++ return &objlseg->lseg; ++ ++ err: ++ kfree(objlseg); ++ return ERR_PTR(status); ++} ++ ++/* ++ * Free a layout segment ++ */ ++void ++objlayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ struct objlayout_segment *objlseg; ++ ++ dprintk("%s: freeing layout segment %p\n", __func__, lseg); ++ ++ if (unlikely(!lseg)) ++ return; ++ ++ objlseg = container_of(lseg, struct objlayout_segment, lseg); ++ objio_free_lseg(objlseg->internal); ++ kfree(objlseg); ++} ++ ++/* ++ * I/O Operations ++ */ ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? 
end - 1 : NFS4_MAX_UINT64; ++} ++ ++static struct objlayout_io_state * ++objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, ++ struct page **pages, ++ unsigned pgbase, ++ unsigned nr_pages, ++ loff_t offset, ++ size_t count, ++ struct pnfs_layout_segment *lseg, ++ void *rpcdata) ++{ ++ struct objlayout_segment *objlseg = ++ container_of(lseg, struct objlayout_segment, lseg); ++ struct objlayout_io_state *state; ++ u64 lseg_end_offset; ++ size_t size_nr_pages; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ if (objio_alloc_io_state(objlseg->internal, &state)) ++ return NULL; ++ ++ BUG_ON(offset < lseg->pls_range.offset); ++ lseg_end_offset = end_offset(lseg->pls_range.offset, lseg->pls_range.length); ++ BUG_ON(offset >= lseg_end_offset); ++ if (offset + count > lseg_end_offset) { ++ count = lseg->pls_range.length - (offset - lseg->pls_range.offset); ++ dprintk("%s: truncated count %Zd\n", __func__, count); ++ } ++ /* fold whole pages of pgbase into pages[], including exactly one page */ ++ if (pgbase >= PAGE_SIZE) { ++ unsigned n = pgbase >> PAGE_SHIFT; ++ ++ pgbase &= ~PAGE_MASK; ++ pages += n; ++ nr_pages -= n; ++ } ++ ++ size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ BUG_ON(nr_pages < size_nr_pages); ++ if (nr_pages > size_nr_pages) ++ nr_pages = size_nr_pages; ++ ++ INIT_LIST_HEAD(&state->err_list); ++ state->objlseg = objlseg; ++ state->rpcdata = rpcdata; ++ state->pages = pages; ++ state->pgbase = pgbase; ++ state->nr_pages = nr_pages; ++ state->offset = offset; ++ state->count = count; ++ state->sync = 0; ++ ++ return state; ++} ++ ++static void ++objlayout_free_io_state(struct objlayout_io_state *state) ++{ ++ dprintk("%s: freeing io_state\n", __func__); ++ if (unlikely(!state)) ++ return; ++ ++ objio_free_io_state(state); ++} ++ ++/* ++ * I/O done common code ++ */ ++static void ++objlayout_iodone(struct objlayout_io_state *state) ++{ ++ dprintk("%s: state %p status\n", __func__, state); ++ ++ if (likely(state->status >= 0)) { ++ objlayout_free_io_state(state); ++ } else { ++ struct 
objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.pls_layout); ++ ++ spin_lock(&objlay->lock); ++ objlay->delta_space_valid = OBJ_DSU_INVALID; ++ list_add(&state->err_list, &objlay->err_list); /* list_add(new, head): queue the failed state on the layout */ ++ spin_unlock(&objlay->lock); ++ } ++} ++ ++/* ++ * objlayout_io_set_result - Set an osd_error code on a specific osd comp. ++ * ++ * The @index component IO failed (error returned from target). Register ++ * the error for later reporting at layout-return. ++ */ ++void ++objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, ++ int osd_error, u64 offset, u64 length, bool is_write) ++{ ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; ++ ++ BUG_ON(index >= state->num_comps); ++ if (osd_error) { ++ struct pnfs_osd_layout *layout = ++ (typeof(layout))state->objlseg->pnfs_osd_layout; ++ ++ ioerr->oer_component = layout->olo_comps[index].oc_object_id; ++ ioerr->oer_comp_offset = offset; ++ ioerr->oer_comp_length = length; ++ ioerr->oer_iswrite = is_write; ++ ioerr->oer_errno = osd_error; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " ++ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", ++ __func__, index, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ } else { ++ /* User need not call if no error is reported */ ++ ioerr->oer_errno = 0; ++ } ++} ++ ++static void _rpc_commit_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_commit_done(wdata); ++} ++ ++/* ++ * Commit data remotely on OSDs ++ */ ++enum pnfs_try_status ++objlayout_commit(struct nfs_write_data *wdata, int how) ++{ ++ int 
status = PNFS_ATTEMPTED; ++ ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ dprintk("%s: Return %d\n", __func__, status); ++ return status; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). ++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_read_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ ++ pnfs_read_done(rdata); ++} ++ ++void ++objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) ++{ ++ int eof = state->eof; ++ struct nfs_read_data *rdata; ++ ++ state->status = status; ++ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); ++ rdata = state->rpcdata; ++ rdata->task.tk_status = status; ++ if (status >= 0) { ++ rdata->res.count = status; ++ rdata->res.eof = eof; ++ } ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_read_done(rdata); ++ else { ++ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); ++ schedule_work(&rdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async reads. 
++ */ ++enum pnfs_try_status ++objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages) ++{ ++ loff_t offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct objlayout_io_state *state; ++ ssize_t status = 0; ++ loff_t eof; ++ ++ dprintk("%s: Begin inode %p offset %llu count %d\n", ++ __func__, rdata->inode, offset, (int)count); ++ ++ eof = i_size_read(rdata->inode); ++ if (unlikely(offset + count > eof)) { ++ if (offset >= eof) { ++ status = 0; ++ rdata->res.count = 0; ++ rdata->res.eof = 1; ++ goto out; ++ } ++ count = eof - offset; ++ } ++ ++ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, ++ rdata->args.pages, rdata->args.pgbase, ++ nr_pages, offset, count, ++ rdata->pdata.lseg, rdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->eof = state->offset + state->count >= eof; ++ ++ status = objio_read_pagelist(state); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ rdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). 
++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_write_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_writeback_done(wdata); ++} ++ ++void ++objlayout_write_done(struct objlayout_io_state *state, ssize_t status, ++ bool sync) ++{ ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s: Begin\n", __func__); ++ wdata = state->rpcdata; ++ state->status = status; ++ wdata->task.tk_status = status; ++ if (status >= 0) { ++ wdata->res.count = status; ++ wdata->verf.committed = state->committed; ++ dprintk("%s: Return status %d committed %d\n", ++ __func__, wdata->task.tk_status, ++ wdata->verf.committed); ++ } else ++ dprintk("%s: Return status %d\n", ++ __func__, wdata->task.tk_status); ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_writeback_done(wdata); ++ else { ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async writes. 
++ */ ++enum pnfs_try_status ++objlayout_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int how) ++{ ++ struct objlayout_io_state *state; ++ ssize_t status; ++ ++ dprintk("%s: Begin inode %p offset %llu count %u\n", ++ __func__, wdata->inode, wdata->args.offset, wdata->args.count); ++ ++ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, ++ wdata->args.pages, ++ wdata->args.pgbase, ++ nr_pages, ++ wdata->args.offset, ++ wdata->args.count, ++ wdata->pdata.lseg, wdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->sync = how & FLUSH_SYNC; ++ ++ status = objio_write_pagelist(state, how & FLUSH_STABLE); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ wdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++void ++objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct pnfs_osd_layoutupdate lou; ++ __be32 *start; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ spin_lock(&objlay->lock); ++ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); ++ lou.dsu_delta = objlay->delta_space_used; ++ objlay->delta_space_used = 0; ++ objlay->delta_space_valid = OBJ_DSU_INIT; ++ lou.olu_ioerr_flag = !list_empty(&objlay->err_list); ++ spin_unlock(&objlay->lock); ++ ++ start = xdr_reserve_space(xdr, 4); ++ ++ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ ++ dprintk("%s: Return delta_space_used %lld err %d\n", __func__, ++ lou.dsu_delta, lou.olu_ioerr_flag); ++} ++ ++static int ++err_prio(u32 oer_errno) ++{ ++ switch (oer_errno) { ++ case 0: ++ return 0; ++ ++ case PNFS_OSD_ERR_RESOURCE: ++ return OSD_ERR_PRI_RESOURCE; ++ case PNFS_OSD_ERR_BAD_CRED: ++ return OSD_ERR_PRI_BAD_CRED; ++ case PNFS_OSD_ERR_NO_ACCESS: ++ return OSD_ERR_PRI_NO_ACCESS; ++ case PNFS_OSD_ERR_UNREACHABLE: 
++ return OSD_ERR_PRI_UNREACHABLE; ++ case PNFS_OSD_ERR_NOT_FOUND: ++ return OSD_ERR_PRI_NOT_FOUND; ++ case PNFS_OSD_ERR_NO_SPACE: ++ return OSD_ERR_PRI_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case PNFS_OSD_ERR_EIO: ++ return OSD_ERR_PRI_EIO; ++ } ++} ++ ++static void ++merge_ioerr(struct pnfs_osd_ioerr *dest_err, ++ const struct pnfs_osd_ioerr *src_err) ++{ ++ u64 dest_end, src_end; ++ ++ if (!dest_err->oer_errno) { ++ *dest_err = *src_err; ++ /* accumulated device must be blank */ ++ memset(&dest_err->oer_component.oid_device_id, 0, ++ sizeof(dest_err->oer_component.oid_device_id)); ++ ++ return; ++ } ++ ++ if (dest_err->oer_component.oid_partition_id != ++ src_err->oer_component.oid_partition_id) ++ dest_err->oer_component.oid_partition_id = 0; ++ ++ if (dest_err->oer_component.oid_object_id != ++ src_err->oer_component.oid_object_id) ++ dest_err->oer_component.oid_object_id = 0; ++ /* take both range ends before the start offset below is lowered */ ++ dest_end = end_offset(dest_err->oer_comp_offset, ++ dest_err->oer_comp_length); ++ src_end = end_offset(src_err->oer_comp_offset, ++ src_err->oer_comp_length); ++ if (dest_end < src_end) ++ dest_end = src_end; ++ ++ if (dest_err->oer_comp_offset > src_err->oer_comp_offset) ++ dest_err->oer_comp_offset = src_err->oer_comp_offset; ++ ++ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; ++ /* same direction: highest priority wins; a write replaces a read */ ++ if ((src_err->oer_iswrite == dest_err->oer_iswrite) && ++ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { ++ dest_err->oer_errno = src_err->oer_errno; ++ } else if (src_err->oer_iswrite && !dest_err->oer_iswrite) { ++ dest_err->oer_iswrite = true; ++ dest_err->oer_errno = src_err->oer_errno; ++ } ++} ++ ++static void ++encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr) ++{ ++ struct objlayout_io_state *state, *tmp; ++ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ ++ for (i = 0; i < state->num_comps; i++) { ++ struct pnfs_osd_ioerr *ioerr = 
&state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ merge_ioerr(&accumulated_err, ioerr); ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++ ++ BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err)); ++} ++ ++void ++objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct objlayout_io_state *state, *tmp; ++ __be32 *start, *uninitialized_var(last_xdr); ++ ++ dprintk("%s: Begin\n", __func__); ++ start = xdr_reserve_space(xdr, 4); ++ BUG_ON(!start); ++ ++ spin_lock(&objlay->lock); ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ int res = 0; ++ ++ for (i = 0; i < state->num_comps && !res; i++) { ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ last_xdr = xdr->p; ++ res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]); ++ } ++ if (unlikely(res)) { ++ /* no space for even one error descriptor */ ++ BUG_ON(last_xdr == start + 1); ++ ++ /* we've 
encountered a situation with lots and lots of ++ * errors and no space to encode them all. Use the last ++ * available slot to report the union of all the ++ * remaining errors. ++ */ ++ xdr_rewind_stream(xdr, last_xdr - ++ pnfs_osd_ioerr_xdr_sz() / 4); ++ encode_accumulated_error(objlay, xdr); ++ goto loop_done; ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++loop_done: ++ spin_unlock(&objlay->lock); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ dprintk("%s: Return\n", __func__); ++} ++ ++struct objlayout_deviceinfo { ++ struct page *page; ++ struct pnfs_osd_deviceaddr da; /* This must be last */ ++}; ++ ++/* Initialize and call nfs_getdeviceinfo, then decode and return a ++ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() ++ * should be called. ++ */ ++int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, ++ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi; ++ struct pnfs_device pd; ++ struct super_block *sb; ++ struct page *page; ++ size_t sz; ++ u32 *p; ++ int err; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ pd.area = page_address(page); ++ ++ memcpy(&pd.dev_id, d_id, sizeof(*d_id)); ++ pd.layout_type = LAYOUT_OSD2_OBJECTS; ++ pd.pages = &page; ++ pd.pgbase = 0; ++ pd.pglen = PAGE_SIZE; ++ pd.mincount = 0; ++ ++ sb = pnfslay->plh_inode->i_sb; ++ err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd); ++ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); ++ if (err) ++ goto err_out; ++ ++ p = pd.area; ++ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p); ++ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL); ++ if (!odi) { ++ err = -ENOMEM; ++ goto err_out; ++ } ++ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); ++ odi->page = page; ++ *deviceaddr = &odi->da; ++ return 0; ++ ++err_out: ++ __free_page(page); ++ return err; ++} ++ ++void objlayout_put_deviceinfo(struct 
pnfs_osd_deviceaddr *deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi = container_of(deviceaddr, ++ struct objlayout_deviceinfo, ++ da); ++ ++ __free_page(odi->page); ++ kfree(odi); ++} ++ ++/* ++ * Perform the objio specific init_mt method. ++ * Set the layout driver private data pointer for later use. ++ */ ++int ++objlayout_set_layoutdriver(struct nfs_server *server, ++ const struct nfs_fh *mntfh) ++{ ++ void *data; ++ ++ data = objio_init_mt(); ++ if (IS_ERR(data)) { ++ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n", ++ __func__, PTR_ERR(data)); ++ return PTR_ERR(data); ++ } ++ server->pnfs_ld_data = data; ++ ++ dprintk("%s: Return data=%p\n", __func__, data); ++ return 0; ++} ++ ++/* ++ * Perform the objio specific fini_mt method to release the ++ * layoutdriver private data. ++ */ ++int ++objlayout_clear_layoutdriver(struct nfs_server *server) ++{ ++ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data); ++ objio_fini_mt(server->pnfs_ld_data); ++ return 0; ++} +diff -up linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.h +--- linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.h.orig 2011-03-26 07:57:44.265821386 -0400 ++++ linux-2.6.38.noarch/fs/nfs/objlayout/objlayout.h 2011-03-26 07:57:44.265821386 -0400 +@@ -0,0 +1,206 @@ ++/* ++ * objlayout.h ++ * ++ * Data types and function declarations for interfacing with the ++ * pNFS standard object layout driver. ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. 
Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#ifndef _OBJLAYOUT_H ++#define _OBJLAYOUT_H ++ ++#include ++#include ++#include "../pnfs.h" ++ ++/* ++ * in-core layout segment ++ */ ++struct objlayout_segment { ++ struct pnfs_layout_segment lseg; ++ void *internal; /* for provider internal use */ ++ u8 pnfs_osd_layout[]; ++}; ++ ++/* ++ * per-inode layout ++ */ ++struct objlayout { ++ struct pnfs_layout_hdr pnfs_layout; ++ ++ /* for layout_commit */ ++ enum osd_delta_space_valid_enum { ++ OBJ_DSU_INIT = 0, ++ OBJ_DSU_VALID, ++ OBJ_DSU_INVALID, ++ } delta_space_valid; ++ s64 delta_space_used; /* consumed by write ops */ ++ ++ /* for layout_return */ ++ spinlock_t lock; ++ struct list_head err_list; ++}; ++ ++static inline struct objlayout * ++OBJLAYOUT(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct objlayout, pnfs_layout); ++} ++ ++/* ++ * per-I/O operation state ++ * embedded in objects provider io_state data structure ++ */ ++struct objlayout_io_state { ++ struct objlayout_segment *objlseg; ++ ++ struct page **pages; ++ unsigned pgbase; ++ unsigned nr_pages; ++ unsigned long count; ++ loff_t offset; ++ bool sync; ++ ++ void *rpcdata; ++ int status; /* res */ ++ int eof; /* res */ ++ int committed; /* res */ ++ ++ /* Error reporting (layout_return) */ ++ struct list_head err_list; ++ unsigned num_comps; ++ /* Pointer to array of error descriptors of size num_comps. ++ * It should contain as many entries as devices in the osd_layout ++ * that participate in the I/O. It is up to the io_engine to allocate ++ * needed space and set num_comps. 
++ */ ++ struct pnfs_osd_ioerr *ioerrs; ++}; ++ ++/* ++ * Raid engine I/O API ++ */ ++extern void *objio_init_mt(void); ++extern void objio_fini_mt(void *mt); ++ ++extern int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout); ++extern void objio_free_lseg(void *p); ++ ++extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp); ++extern void objio_free_io_state(struct objlayout_io_state *state); ++ ++extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); ++extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable); ++ ++/* ++ * callback API ++ */ ++extern void objlayout_io_set_result(struct objlayout_io_state *state, ++ unsigned index, int osd_error, ++ u64 offset, u64 length, bool is_write); ++ ++static inline void ++objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) ++{ ++ struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.pls_layout); ++ ++ /* If one of the I/Os errored out and the delta_space_used was ++ * invalid we render the complete report as invalid. Protocol mandate ++ * the DSU be accurate or not reported. 
++ */ ++ spin_lock(&objlay->lock); ++ if (objlay->delta_space_valid != OBJ_DSU_INVALID) { ++ objlay->delta_space_valid = OBJ_DSU_VALID; ++ objlay->delta_space_used += space_used; ++ } ++ spin_unlock(&objlay->lock); ++} ++ ++extern void objlayout_read_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++extern void objlayout_write_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++ ++extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, ++ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr); ++extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); ++ ++/* ++ * exported generic objects function vectors ++ */ ++ ++extern int objlayout_set_layoutdriver( ++ struct nfs_server *, ++ const struct nfs_fh *); ++extern int objlayout_clear_layoutdriver(struct nfs_server *); ++ ++extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *); ++extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); ++ ++extern struct pnfs_layout_segment *objlayout_alloc_lseg( ++ struct pnfs_layout_hdr *, ++ struct nfs4_layoutget_res *); ++extern void objlayout_free_lseg(struct pnfs_layout_segment *); ++ ++extern enum pnfs_try_status objlayout_read_pagelist( ++ struct nfs_read_data *, ++ unsigned nr_pages); ++ ++extern enum pnfs_try_status objlayout_write_pagelist( ++ struct nfs_write_data *, ++ unsigned nr_pages, ++ int how); ++ ++extern enum pnfs_try_status objlayout_commit( ++ struct nfs_write_data *, ++ int how); ++ ++extern void objlayout_encode_layoutcommit( ++ struct pnfs_layout_hdr *, ++ struct xdr_stream *, ++ const struct nfs4_layoutcommit_args *); ++ ++extern void objlayout_encode_layoutreturn( ++ struct pnfs_layout_hdr *, ++ struct xdr_stream *, ++ const struct nfs4_layoutreturn_args *); ++ ++#endif /* _OBJLAYOUT_H */ +diff -up linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.c +--- 
linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2011-03-26 07:57:44.266821378 -0400 ++++ linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.c 2011-03-26 07:57:44.266821378 -0400 +@@ -0,0 +1,702 @@ ++/* ++ * panfs_shim.c ++ * ++ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. 
 ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "objlayout.h" ++#include "panfs_shim.h" ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct panfs_export_operations *panfs_export_ops; ++ ++void * ++objio_init_mt(void) ++{ ++ return panfs_export_ops == NULL ? ERR_PTR(-EAGAIN) : NULL; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++} ++ ++static int ++panfs_shim_conv_raid01(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) { ++ hdr->type = PAN_AGG_RAID1; ++ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1; ++ } else if (layout->olo_num_comps > 1) { ++ hdr->type = PAN_AGG_RAID0; ++ hdr->hdr.raid0.num_comps = layout->olo_num_comps; ++ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit; ++ } else ++ hdr->type = PAN_AGG_SIMPLE; ++ return 0; ++} ++ ++static int ++panfs_shim_conv_raid5(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) ++ goto err; ++ ++ if (lo_map->odm_group_width || lo_map->odm_group_depth) { ++ if (!lo_map->odm_group_width || !lo_map->odm_group_depth) ++ goto err; ++ ++ hdr->type = PAN_AGG_GRP_RAID5_LEFT; ++ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit; ++ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width; ++ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth; ++ /* This is a guess: the Panasas server is not supposed to ++ hand out any other layout. */ ++ hdr->hdr.grp_raid5_left.group_layout_policy = ++ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN; ++ } else { ++ hdr->type = PAN_AGG_RAID5_LEFT; ++ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.raid5_left.stripe_unit2 = ++ 
hdr->hdr.raid5_left.stripe_unit1 = ++ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit; ++ } ++ ++ return 0; ++err: ++ return -EINVAL; ++} ++ ++/* ++ * Convert a pnfs_osd data map into Panasas aggregation layout header ++ */ ++static int ++panfs_shim_conv_pnfs_osd_data_map( ++ struct pnfs_osd_layout *layout, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ int status = -EINVAL; ++ struct pnfs_osd_data_map *lo_map = &layout->olo_map; ++ ++ if (!layout->olo_num_comps) { ++ dprintk("%s: !!layout.n_comps(%u)\n", __func__, ++ layout->olo_num_comps); ++ goto err; ++ } ++ ++ switch (lo_map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_0 " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid01(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_5: ++ if (!lo_map->odm_group_width) { ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width " ++ "layout.n_comps(%u)!=map.n_comps(%u) " ++ "|| comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ } else if ((layout->olo_num_comps != lo_map->odm_num_comps && ++ layout->olo_num_comps > lo_map->odm_group_width) || ++ (layout->olo_comps_index % lo_map->odm_group_width)){ ++ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ lo_map->odm_group_width, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid5(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_PQ: ++ default: ++ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__, ++ 
lo_map->odm_raid_algorithm); ++ goto err; ++ } ++ ++ return 0; ++ ++err: ++ return status; ++} ++ ++/* ++ * Convert pnfs_osd layout into Panasas map and caps type ++ */ ++int ++objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) ++{ ++ int i, total_comps; ++ int status; ++ struct pnfs_osd_object_cred *lo_comp; ++ pan_size_t alloc_sz, local_sz; ++ pan_sm_map_cap_t *mcs = NULL; ++ u8 *buf; ++ pan_agg_comp_obj_t *pan_comp; ++ pan_sm_sec_t *pan_sec; ++ ++ status = -EINVAL; ++ if (layout->olo_num_comps < layout->olo_map.odm_group_width) { ++ total_comps = layout->olo_comps_index + layout->olo_num_comps; ++ } else { ++ /* allocate full map, otherwise SAM gets confused */ ++ total_comps = layout->olo_map.odm_num_comps; ++ } ++ alloc_sz = total_comps * ++ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t)); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p = layout->olo_comps[i].oc_cap.cred; ++ if (panfs_export_ops->sm_sec_t_get_size_otw( ++ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL)) ++ goto err; ++ alloc_sz += local_sz; ++ } ++ ++ status = -ENOMEM; ++ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL); ++ if (!mcs) ++ goto err; ++ buf = (u8 *)&mcs[1]; ++ ++ mcs->offset = lseg->pls_range.offset; ++ mcs->length = lseg->pls_range.length; ++#if 0 ++ /* FIXME: for now */ ++ mcs->expiration_time.ts_sec = 0; ++ mcs->expiration_time.ts_nsec = 0; ++#endif ++ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL; ++ status = panfs_shim_conv_pnfs_osd_data_map(layout, ++ &mcs->full_map.layout_hdr); ++ if (status) ++ goto err; ++ ++ mcs->full_map.components.size = total_comps; ++ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf; ++ buf += total_comps * sizeof(pan_agg_comp_obj_t); ++ ++ mcs->secs.size = total_comps; ++ mcs->secs.data = (pan_sm_sec_t *)buf; ++ buf += total_comps * sizeof(pan_sm_sec_t); ++ ++ lo_comp = layout->olo_comps; ++ pan_comp = 
mcs->full_map.components.data + layout->olo_comps_index; ++ pan_sec = mcs->secs.data + layout->olo_comps_index; ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p; ++ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id; ++ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id; ++ u64 dev_id = __be64_to_cpup( ++ (__be64 *)oc_obj_id->oid_device_id.data + 1); ++ ++ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n", ++ __func__, i, ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data), ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1), ++ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id); ++ ++ if (i == 0) { ++ /* make up mgr_id to calm sam down */ ++ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0, ++ &obj_id->dev_id); ++ obj_id->grp_id = oc_obj_id->oid_partition_id; ++ obj_id->obj_id = oc_obj_id->oid_object_id; ++ } ++ ++ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) { ++ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n", ++ __func__, i, (u64)obj_id->grp_id, ++ lo_comp->oc_object_id.oid_partition_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) { ++ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n", ++ __func__, i, obj_id->obj_id, ++ lo_comp->oc_object_id.oid_object_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ pan_comp->dev_id = dev_id; ++ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) { ++ /* report the component id that failed the check, not the made-up mgr id */ ++ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n", ++ __func__, i, pan_comp->dev_id); ++ status = -EINVAL; ++ goto err; ++ } ++ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) { ++ dprintk("%s: degraded maps not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL; ++ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) { ++ dprintk("%s: cap key security not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ ++ p = 
lo_comp->oc_cap.cred; ++ panfs_export_ops->sm_sec_t_unmarshall( ++ (pan_sm_sec_otw_t *)&p, ++ pan_sec, ++ buf, ++ alloc_sz, ++ NULL, ++ &local_sz); ++ buf += local_sz; ++ alloc_sz -= local_sz; ++ ++ lo_comp++; ++ pan_comp++; ++ pan_sec++; ++ } ++ ++ *outp = mcs; ++ dprintk("%s:Return mcs=%p\n", __func__, mcs); ++ return 0; ++ ++err: ++ objio_free_lseg(mcs); ++ dprintk("%s:Error %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Free a Panasas map and caps type ++ */ ++void ++objio_free_lseg(void *p) ++{ ++ kfree(p); ++} ++ ++/* ++ * I/O routines ++ */ ++int ++objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct panfs_shim_io_state *p; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ p = kzalloc(sizeof(*p), GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ *outp = &p->ol_state; ++ return 0; ++} ++ ++/* ++ * Free an I/O state ++ */ ++void ++objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ int i; ++ ++ dprintk("%s: freeing io_state\n", __func__); ++ for (i = 0; i < state->ol_state.nr_pages; i++) ++ kunmap(state->ol_state.pages[i]); ++ ++ if (state->ucreds) ++ panfs_export_ops->ucreds_put(state->ucreds); ++ kfree(state->sg_list); ++ kfree(state); ++} ++ ++static int ++panfs_shim_pages_to_sg( ++ struct panfs_shim_io_state *state, ++ struct page **pages, ++ unsigned int pgbase, ++ unsigned nr_pages, ++ size_t count) ++{ ++ unsigned i, n; ++ pan_sg_entry_t *sg; ++ ++ dprintk("%s pgbase %u nr_pages %u count %d " ++ "pg0 %p flags 0x%x index %llu\n", ++ __func__, pgbase, nr_pages, (int)count, pages[0], ++ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index); ++ ++ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL); ++ if (sg == NULL) ++ return -ENOMEM; ++ ++ dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n", ++ __func__, sg, pages, pgbase, nr_pages); ++ ++ for (i = 0; i < nr_pages; i++) { ++ 
sg[i].buffer = (char *)kmap(pages[i]) + pgbase; ++ n = PAGE_SIZE - pgbase; ++ pgbase = 0; ++ if (n > count) ++ n = count; ++ sg[i].chunk_size = n; ++ count -= n; ++ if (likely(count)) { ++ sg[i].next = &sg[i+1]; ++ } else { ++ /* we're done */ ++ sg[i].next = NULL; ++ break; ++ } ++ } ++ BUG_ON(count); ++ ++ state->sg_list = sg; ++ return 0; ++} ++ ++/* ++ * Callback function for async reads ++ */ ++static void ++panfs_shim_read_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.read.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++ status = res_p->length; ++ WARN_ON(status < 0); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_read rc %d: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_read_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.read.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP, ++ &state->u.read.args, ++ 
&state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? ++ NULL : panfs_shim_read_done, ++ state, NULL, ++ &state->u.read.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_read_done(state, NULL, &state->u.read.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Callback function for async writes ++ */ ++static void ++panfs_shim_write_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.write.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++/* state->ol_state.committed = NFS_FILE_SYNC;*/ ++ state->ol_state.committed = NFS_UNSTABLE; ++ status = res_p->length; ++ WARN_ON(status < 0); ++ ++ objlayout_add_delta_space_used(&state->ol_state, ++ res_p->delta_capacity_used); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_write rc %u: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_write_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable /* unused, PanOSD writes are stable */) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ 
++ state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.write.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE, ++ &state->u.write.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? ++ NULL : panfs_shim_write_done, ++ state, ++ NULL, ++ &state->u.write.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_write_done(state, NULL, &state->u.write.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++int ++panfs_shim_register(struct panfs_export_operations *ops) ++{ ++ if (panfs_export_ops) { ++ printk(KERN_INFO ++ "%s: panfs already registered (panfs ops %p)\n", ++ __func__, panfs_export_ops); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: registering panfs ops %p\n", ++ __func__, ops); ++ ++ panfs_export_ops = ops; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_register); ++ ++int ++panfs_shim_unregister(void) ++{ ++ if (!panfs_export_ops) { ++ printk(KERN_INFO "%s: panfs is not registered\n", __func__); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: unregistering panfs ops %p\n", ++ __func__, panfs_export_ops); ++ ++ panfs_export_ops = NULL; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_unregister); ++ ++/* ++ * Policy Operations ++ */ ++ ++#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024) ++#define PANLAYOUT_DEF_STRIPE_WIDTH 9 ++#define PANLAYOUT_MAX_STRIPE_WIDTH 11 ++#define PANLAYOUT_MAX_GATHER_STRIPES 8 ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++panlayout_get_blocksize(void) ++{ ++ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) * ++ PANLAYOUT_DEF_STRIPE_UNIT * ++ PANLAYOUT_MAX_GATHER_STRIPES; ++ dprintk("%s: Return %Zd\n", __func__, sz); ++ return sz; ++} ++ ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. 
++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS) ++ ++static struct pnfs_layoutdriver_type panlayout_type = { ++ .id = PNFS_LAYOUT_PANOSD, ++ .name = "PNFS_LAYOUT_PANOSD", ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ ++ .set_layoutdriver = objlayout_set_layoutdriver, ++ .clear_layoutdriver = objlayout_clear_layoutdriver, ++ ++ .alloc_layout_hdr = objlayout_alloc_layout_hdr, ++ .free_layout_hdr = objlayout_free_layout_hdr, ++ ++ .alloc_lseg = objlayout_alloc_lseg, ++ .free_lseg = objlayout_free_lseg, ++ ++ .get_blocksize = panlayout_get_blocksize, ++ ++ .read_pagelist = objlayout_read_pagelist, ++ .write_pagelist = objlayout_write_pagelist, ++ .commit = objlayout_commit, ++ ++ .encode_layoutcommit = objlayout_encode_layoutcommit, ++ .encode_layoutreturn = objlayout_encode_layoutreturn, ++}; ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++panlayout_init(void) ++{ ++ int ret = pnfs_register_layoutdriver(&panlayout_type); ++ ++ if (ret) ++ printk(KERN_INFO ++ "%s: Registering Panasas OSD pNFS Layout Driver failed: error=%d\n", ++ __func__, ret); ++ else ++ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++ return ret; ++} ++ ++static void __exit ++panlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++} ++ ++module_init(panlayout_init); ++module_exit(panlayout_exit); +diff -up linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.h +--- linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2011-03-26 07:57:44.267821370 -0400 ++++ linux-2.6.38.noarch/fs/nfs/objlayout/panfs_shim.h 2011-03-26 07:57:44.267821370 -0400 +@@ -0,0 +1,482 @@ ++/* ++ * panfs_shim.h ++ * ++ * Data types and 
external function declarations for interfacing with ++ * panfs (Panasas DirectFlow) I/O stack ++ * ++ * Copyright (C) 2007 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. 
++ * ++ */ ++ ++#ifndef _PANLAYOUT_PANFS_SHIM_H ++#define _PANLAYOUT_PANFS_SHIM_H ++ ++typedef s8 pan_int8_t; ++typedef u8 pan_uint8_t; ++typedef s16 pan_int16_t; ++typedef u16 pan_uint16_t; ++typedef s32 pan_int32_t; ++typedef u32 pan_uint32_t; ++typedef s64 pan_int64_t; ++typedef u64 pan_uint64_t; ++ ++/* ++ * from pan_base_types.h ++ */ ++typedef pan_uint64_t pan_rpc_none_t; ++typedef pan_uint32_t pan_rpc_arrdim_t; ++typedef pan_uint32_t pan_status_t; ++typedef pan_uint8_t pan_otw_t; ++typedef pan_uint8_t pan_pad_t; ++ ++typedef pan_uint32_t pan_timespec_sec_t; ++typedef pan_uint32_t pan_timespec_nsec_t; ++ ++typedef struct pan_timespec_s pan_timespec_t; ++struct pan_timespec_s { ++ pan_timespec_sec_t ts_sec; ++ pan_timespec_nsec_t ts_nsec; ++}; ++ ++/* ++ * from pan_std_types.h ++ */ ++typedef pan_uint32_t pan_size_t; ++typedef int pan_bool_t; ++ ++/* ++ * from pan_common_error.h ++ */ ++#define PAN_SUCCESS ((pan_status_t)0) ++#define PAN_ERR_IN_PROGRESS ((pan_status_t)55) ++ ++/* ++ * from pan_sg.h ++ */ ++typedef struct pan_sg_entry_s pan_sg_entry_t; ++struct pan_sg_entry_s { ++ void *buffer; /* pointer to memory */ ++ pan_uint32_t chunk_size; /* size of each chunk (bytes) */ ++ pan_sg_entry_t *next; ++}; ++ ++/* ++ * from pan_storage.h ++ */ ++typedef pan_uint64_t pan_stor_dev_id_t; ++typedef pan_uint32_t pan_stor_obj_grp_id_t; ++typedef pan_uint64_t pan_stor_obj_uniq_t; ++typedef pan_uint32_t pan_stor_action_t; ++typedef pan_uint8_t pan_stor_cap_key_t[20]; ++ ++typedef pan_uint8_t pan_stor_key_type_t; ++typedef pan_uint64_t pan_stor_len_t; ++typedef pan_int64_t pan_stor_delta_len_t; ++typedef pan_uint64_t pan_stor_offset_t; ++typedef pan_uint16_t pan_stor_op_t; ++ ++typedef pan_uint16_t pan_stor_sec_level_t; ++ ++struct pan_stor_obj_id_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_obj_uniq_t obj_id; ++ pan_stor_obj_grp_id_t grp_id; ++}; ++ ++typedef struct pan_stor_obj_id_s pan_stor_obj_id_t; ++ ++#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U) ++#define 
PAN_STOR_OP_READ ((pan_stor_op_t) 8U) ++#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U) ++#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U) ++#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U) ++#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U) ++#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U) ++#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U) ++ ++/* ++ * from pan_aggregation_map.h ++ */ ++typedef pan_uint8_t pan_agg_type_t; ++typedef pan_uint64_t pan_agg_map_version_t; ++typedef pan_uint8_t pan_agg_obj_state_t; ++typedef pan_uint8_t pan_agg_comp_state_t; ++typedef pan_uint8_t pan_agg_comp_flag_t; ++ ++#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00) ++#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01) ++#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02) ++#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03) ++#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04) ++#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05) ++#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06) ++#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07) ++#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00) ++#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01) ++#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02) ++#define PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03) ++#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00) ++#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01) ++#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02) ++#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04) ++ ++struct pan_aggregation_map_s { ++ pan_agg_map_version_t version; ++ pan_agg_obj_state_t avail_state; ++ pan_stor_obj_id_t obj_id; ++}; ++ ++typedef struct pan_aggregation_map_s pan_aggregation_map_t; ++ ++struct pan_agg_comp_obj_s { ++ pan_stor_dev_id_t dev_id; ++ pan_agg_comp_state_t avail_state; ++ pan_agg_comp_flag_t 
comp_flags; ++}; ++ ++typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t; ++ ++struct pan_agg_simple_header_s { ++ pan_uint8_t unused; ++}; ++ ++typedef struct pan_agg_simple_header_s pan_agg_simple_header_t; ++ ++struct pan_agg_raid1_header_s { ++ pan_uint16_t num_comps; ++}; ++ ++typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t; ++ ++struct pan_agg_raid0_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++}; ++ ++typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t; ++ ++struct pan_agg_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit0; ++ pan_uint32_t stripe_unit1; ++ pan_uint32_t stripe_unit2; ++}; ++ ++typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t; ++ ++typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t; ++ ++struct pan_agg_grp_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++ pan_uint16_t rg_width; ++ pan_uint16_t rg_depth; ++ pan_uint8_t group_layout_policy; ++}; ++ ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00) ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01) ++ ++#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00) ++#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02) ++#define PAN_AGG_RAID0 ((pan_agg_type_t) 0x03) ++#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04) ++#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06) ++#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06) ++ ++struct pan_agg_layout_hdr_s { ++ pan_agg_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_uint64_t null; ++ pan_agg_simple_header_t simple; ++ pan_agg_raid1_header_t raid1; ++ pan_agg_raid0_header_t raid0; ++ pan_agg_raid5_left_header_t raid5_left; ++ pan_agg_grp_raid5_left_header_t grp_raid5_left; ++ } hdr; ++}; ++ ++typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t; ++ ++struct 
pan_agg_comp_obj_a_s { ++ pan_rpc_arrdim_t size; ++ pan_agg_comp_obj_t *data; ++}; ++typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a; ++ ++struct pan_agg_full_map_s { ++ pan_aggregation_map_t map_hdr; ++ pan_agg_layout_hdr_t layout_hdr; ++ pan_agg_comp_obj_a components; ++}; ++ ++typedef struct pan_agg_full_map_s pan_agg_full_map_t; ++ ++/* ++ * from pan_obsd_rpc_types.h ++ */ ++typedef pan_uint8_t pan_obsd_security_key_a[16]; ++ ++typedef pan_uint8_t pan_obsd_capability_key_a[20]; ++ ++typedef pan_uint8_t pan_obsd_key_holder_id_t; ++ ++#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01) ++#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02) ++ ++struct pan_obsd_key_holder_s { ++ pan_obsd_key_holder_id_t select; ++ pan_pad_t pad[3]; ++ union { ++ pan_obsd_security_key_a basis_key; ++ pan_obsd_capability_key_a cap_key; ++ } key; ++}; ++ ++typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t; ++ ++/* ++ * from pan_sm_sec.h ++ */ ++typedef pan_uint8_t pan_sm_sec_type_t; ++typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t; ++ ++struct pan_obsd_capability_generic_otw_t_s { ++ pan_rpc_arrdim_t size; ++ pan_uint8_t *data; ++}; ++typedef struct pan_obsd_capability_generic_otw_t_s ++ pan_obsd_capability_generic_otw_t; ++ ++struct pan_sm_sec_obsd_s { ++ pan_obsd_key_holder_t key; ++ pan_obsd_capability_generic_otw_t cap_otw; ++ pan_sm_sec_otw_allo_mode_t allo_mode; ++}; ++ ++typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t; ++ ++struct pan_sm_sec_s { ++ pan_sm_sec_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_rpc_none_t none; ++ pan_sm_sec_obsd_t obsd; ++ } variant; ++}; ++ ++typedef struct pan_sm_sec_s pan_sm_sec_t; ++ ++struct pan_sm_sec_a_s { ++ pan_rpc_arrdim_t size; ++ pan_sm_sec_t *data; ++}; ++typedef struct pan_sm_sec_a_s pan_sm_sec_a; ++typedef pan_otw_t *pan_sm_sec_otw_t; ++ ++/* ++ * from pan_sm_types.h ++ */ ++typedef pan_uint64_t pan_sm_cap_handle_t; ++ ++struct pan_sm_map_cap_s { ++ pan_agg_full_map_t 
full_map; ++ pan_stor_offset_t offset; ++ pan_stor_len_t length; ++ pan_sm_sec_a secs; ++ pan_sm_cap_handle_t handle; ++ pan_timespec_t expiration_time; ++ pan_stor_action_t action_mask; ++ pan_uint32_t flags; ++}; ++ ++typedef struct pan_sm_map_cap_s pan_sm_map_cap_t; ++ ++/* ++ * from pan_sm_ops.h ++ */ ++typedef pan_rpc_none_t pan_sm_cache_ptr_t; ++ ++/* ++ * from pan_sam_api.h ++ */ ++typedef pan_uint32_t pan_sam_access_flags_t; ++ ++typedef struct pan_sam_dev_error_s pan_sam_dev_error_t; ++struct pan_sam_dev_error_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_op_t stor_op; ++ pan_status_t error; ++}; ++ ++typedef struct pan_sam_ext_status_s pan_sam_ext_status_t; ++struct pan_sam_ext_status_s { ++ pan_uint32_t available; ++ pan_uint32_t size; ++ pan_sam_dev_error_t *errors; ++}; ++ ++enum pan_sam_rpc_sec_sel_e { ++ PAN_SAM_RPC_SEC_DEFAULT, ++ PAN_SAM_RPC_SEC_ATLEAST, ++ PAN_SAM_RPC_SEC_EXACTLY ++}; ++typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t; ++ ++typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t; ++struct pan_sam_obj_sec_s { ++ pan_stor_sec_level_t min_security; ++ pan_sm_map_cap_t *map_ccaps; ++}; ++ ++typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t; ++struct pan_sam_rpc_sec_s { ++ pan_sam_rpc_sec_sel_t selector; ++}; ++ ++typedef struct pan_sam_read_args_s pan_sam_read_args_t; ++struct pan_sam_read_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ void *return_attr; ++ void *checksum; ++ pan_stor_offset_t offset; ++ pan_uint16_t sm_options; ++ void *callout; ++ void *callout_arg; ++}; ++ ++typedef struct pan_sam_read_res_s pan_sam_read_res_t; ++struct pan_sam_read_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ void *attr; ++ void *checksum; ++}; ++ ++typedef void (*pan_sam_read_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t status); ++ ++#define PAN_SAM_ACCESS_NONE 0x0000 ++#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020 ++ 
++typedef struct pan_sam_write_args_s pan_sam_write_args_t; ++struct pan_sam_write_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ pan_stor_offset_t offset; ++ void *attr; ++ void *return_attr; ++}; ++ ++typedef struct pan_sam_write_res_s pan_sam_write_res_t; ++struct pan_sam_write_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ pan_stor_delta_len_t delta_capacity_used; ++ pan_bool_t parity_dirty; ++ void *attr; ++}; ++ ++typedef void (*pan_sam_write_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t status); ++ ++/* ++ * from pan_mgr_types.h ++ */ ++#define PAN_MGR_ID_TYPE_SHIFT 56 ++#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL) ++#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL) ++ ++typedef pan_uint16_t pan_mgr_type_t; ++typedef pan_uint64_t pan_mgr_id_t; ++ ++#define PAN_MGR_SM ((pan_mgr_type_t) 2U) ++#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U) ++ ++/* ++ * from pan_mgr_types_c.h ++ */ ++#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \ ++ pan_mgr_id_t _id1, _id2; \ ++\ ++ _id1 = (_mgr_type_); \ ++ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \ ++ _id1 &= PAN_MGR_ID_TYPE_MASK; \ ++ _id2 = (_mgr_uniq_); \ ++ _id2 &= PAN_MGR_ID_UNIQ_MASK; \ ++ _id1 |= _id2; \ ++ *(_mgr_id_p_) = _id1; \ ++} ++ ++/* ++ * from pan_storage_c.h ++ */ ++#define pan_stor_is_device_id_an_obsd_id(_device_id_) \ ++ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \ ++ == PAN_MGR_OBSD) ++ ++/* ++ * pnfs_shim internal definitions ++ */ ++ ++struct panfs_shim_io_state { ++ struct objlayout_io_state ol_state; ++ ++ pan_sg_entry_t *sg_list; ++ pan_sam_obj_sec_t obj_sec; ++ void *ucreds; ++ union { ++ struct { ++ pan_sam_read_args_t args; ++ pan_sam_read_res_t res; ++ } read; ++ struct { ++ pan_sam_write_args_t args; ++ pan_sam_write_res_t res; ++ } write; ++ } u; ++}; ++ ++#endif /* _PANLAYOUT_PANFS_SHIM_H */ 
+diff -up linux-2.6.38.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.38.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +--- linux-2.6.38.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2011-03-26 07:57:44.268821362 -0400 ++++ linux-2.6.38.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2011-03-26 07:57:44.268821362 -0400 +@@ -0,0 +1,435 @@ ++/* ++ * pnfs_osd_xdr.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* ++ * The following implementation is based on these Internet Drafts: ++ * ++ * draft-ietf-nfsv4-minorversion-21 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid) ++{ ++ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data)); ++ READ64(objid->oid_partition_id); ++ READ64(objid->oid_object_id); ++ return p; ++} ++ ++static inline u32 * ++pnfs_osd_xdr_decode_opaque_cred(u32 *p, ++ struct pnfs_osd_opaque_cred *opaque_cred) ++{ ++ READ32(opaque_cred->cred_len); ++ COPYMEM(opaque_cred->cred, opaque_cred->cred_len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp, ++ u8 **credp) ++{ ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id); ++ READ32(comp->oc_osd_version); ++ READ32(comp->oc_cap_key_sec); ++ ++ cred = *credp; ++ comp->oc_cap_key.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key); ++ cred = 
(u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len)); ++ comp->oc_cap.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len)); ++ *credp = cred; ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map) ++{ ++ READ32(data_map->odm_num_comps); ++ READ64(data_map->odm_stripe_unit); ++ READ32(data_map->odm_group_width); ++ READ32(data_map->odm_group_depth); ++ READ32(data_map->odm_mirror_cnt); ++ READ32(data_map->odm_raid_algorithm); ++ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " ++ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", ++ __func__, ++ data_map->odm_num_comps, ++ (unsigned long long)data_map->odm_stripe_unit, ++ data_map->odm_group_width, ++ data_map->odm_group_depth, ++ data_map->odm_mirror_cnt, ++ data_map->odm_raid_algorithm); ++ return p; ++} ++ ++struct pnfs_osd_layout * ++pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p) ++{ ++ int i; ++ u32 *start = p; ++ struct pnfs_osd_object_cred *comp; ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map); ++ READ32(layout->olo_comps_index); ++ READ32(layout->olo_num_comps); ++ layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1); ++ comp = layout->olo_comps; ++ cred = (u8 *)(comp + layout->olo_num_comps); ++ dprintk("%s: comps_index=%u num_comps=%u\n", ++ __func__, layout->olo_comps_index, layout->olo_num_comps); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred); ++ dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "key_len=%u cap_len=%u\n", ++ __func__, i, ++ _DEVID_LO(&comp->oc_object_id.oid_device_id), 
++ _DEVID_HI(&comp->oc_object_id.oid_device_id), ++ comp->oc_object_id.oid_partition_id, ++ comp->oc_object_id.oid_object_id, ++ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); ++ comp++; ++ } ++ dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__, ++ (char *)p - (char *)start, cred, (char *)cred - (char *)layout); ++ return layout; ++} ++ ++/* ++ * Get Device Information Decoding ++ * ++ * Note: since Device Information is currently done synchronously, most ++ * of the actual fields are left inside the rpc buffer and are only ++ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer ++ * should not be freed while the returned information is in use. ++ */ ++ ++u32 *__xdr_read_calc_nfs4_string( ++ u32 *p, struct nfs4_string *str, u8 **freespace) ++{ ++ u32 len; ++ char *data; ++ bool need_copy; ++ ++ READ32(len); ++ data = (char *)p; ++ ++ if (data[len]) { /* Not null terminated we'll need extra space */ ++ data = *freespace; ++ *freespace += len + 1; ++ need_copy = true; ++ } else { ++ need_copy = false; ++ } ++ ++ if (str) { ++ str->len = len; ++ str->data = data; ++ if (need_copy) { ++ memcpy(data, p, len); ++ data[len] = 0; ++ } ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++u32 *__xdr_read_calc_u8_opaque( ++ u32 *p, struct nfs4_string *str) ++{ ++ u32 len; ++ ++ READ32(len); ++ ++ if (str) { ++ str->len = len; ++ str->data = (char *)p; ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetid { ++ * u32 oti_type; ++ * struct nfs4_string oti_scsi_device_id; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetid( ++ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace) ++{ ++ u32 oti_type; ++ ++ READ32(oti_type); ++ if (targetid) ++ targetid->oti_type = oti_type; ++ ++ switch (oti_type) { ++ case OBJ_TARGET_SCSI_NAME: ++ case OBJ_TARGET_SCSI_DEVICE_ID: ++ p = __xdr_read_calc_u8_opaque(p, ++ targetid ? 
&targetid->oti_scsi_device_id : NULL); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_net_addr { ++ * struct nfs4_string r_netid; ++ * struct nfs4_string r_addr; ++ * }; ++ */ ++u32 *__xdr_read_calc_net_addr( ++ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace) ++{ ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? &netaddr->r_netid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? &netaddr->r_addr : NULL, ++ freespace); ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetaddr { ++ * u32 ota_available; ++ * struct pnfs_osd_net_addr ota_netaddr; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetaddr( ++ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace) ++{ ++ u32 ota_available; ++ ++ READ32(ota_available); ++ if (targetaddr) ++ targetaddr->ota_available = ota_available; ++ ++ if (ota_available) { ++ p = __xdr_read_calc_net_addr(p, ++ targetaddr ? &targetaddr->ota_netaddr : NULL, ++ freespace); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++u32 *__xdr_read_calc_deviceaddr( ++ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace) ++{ ++ p = __xdr_read_calc_targetid(p, ++ deviceaddr ? &deviceaddr->oda_targetid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_targetaddr(p, ++ deviceaddr ? &deviceaddr->oda_targetaddr : NULL, ++ freespace); ++ ++ if (deviceaddr) ++ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun)); ++ else ++ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun)); ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? 
&deviceaddr->oda_systemid : NULL); ++ ++ if (deviceaddr) { ++ p = pnfs_osd_xdr_decode_object_cred(p, ++ &deviceaddr->oda_root_obj_cred, freespace); ++ } else { ++ *freespace += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? &deviceaddr->oda_osdname : NULL); ++ ++ return p; ++} ++ ++size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p) ++{ ++ u8 *null_freespace = NULL; ++ size_t sz; ++ ++ __xdr_read_calc_deviceaddr(p, NULL, &null_freespace); ++ sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace; ++ ++ return sz; ++} ++ ++void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p) ++{ ++ u8 *freespace = (u8 *)(deviceaddr + 1); ++ ++ __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace); ++} ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou) ++{ ++ __be32 *p = xdr_reserve_space(xdr, 16); ++ ++ if (!p) ++ return -E2BIG; ++ ++ *p++ = cpu_to_be32(lou->dsu_valid); ++ if (lou->dsu_valid) ++ p = xdr_encode_hyper(p, lou->dsu_delta); ++ *p++ = cpu_to_be32(lou->olu_ioerr_flag); ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ */ ++static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p; ++ ++ p = xdr_reserve_space(xdr, 32); ++ if (!p) ++ return -E2BIG; ++ ++ p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, ++ sizeof(object_id->oid_device_id.data)); ++ p = xdr_encode_hyper(p, object_id->oid_partition_id); ++ p = xdr_encode_hyper(p, object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; 
++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, ++ struct pnfs_osd_ioerr *ioerr) ++{ ++ __be32 *p; ++ int ret; ++ ++ ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component); ++ if (ret) ++ return ret; ++ ++ p = xdr_reserve_space(xdr, 24); ++ if (!p) ++ return -E2BIG; ++ ++ p = xdr_encode_hyper(p, ioerr->oer_comp_offset); ++ p = xdr_encode_hyper(p, ioerr->oer_comp_length); ++ *p++ = cpu_to_be32(ioerr->oer_iswrite); ++ *p = cpu_to_be32(ioerr->oer_errno); ++ ++ return 0; ++} +diff -up linux-2.6.38.noarch/fs/nfs/pagelist.c.orig linux-2.6.38.noarch/fs/nfs/pagelist.c +--- linux-2.6.38.noarch/fs/nfs/pagelist.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/pagelist.c 2011-03-26 07:57:44.269821354 -0400 +@@ -20,6 +20,7 @@ + #include + + #include "internal.h" ++#include "pnfs.h" + + static struct kmem_cache *nfs_page_cachep; + +@@ -53,7 +54,8 @@ nfs_page_free(struct nfs_page *p) + struct nfs_page * + nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + +@@ -84,6 +86,9 @@ nfs_create_request(struct nfs_open_conte + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); + kref_init(&req->wb_kref); ++ req->wb_lseg = lseg; ++ if (lseg) ++ get_lseg(lseg); + return req; + } + +@@ -159,9 +164,12 @@ void nfs_clear_request(struct nfs_page * + put_nfs_open_context(ctx); + req->wb_context = NULL; + } ++ if (req->wb_lseg != NULL) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } + } + +- + /** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release +@@ -240,7 +248,8 @@ void nfs_pageio_init(struct nfs_pageio_d + * Return 'true' if this is the case, else return 'false'. 
+ */ + static int nfs_can_coalesce_requests(struct nfs_page *prev, +- struct nfs_page *req) ++ struct nfs_page *req, ++ struct nfs_pageio_descriptor *pgio) + { + if (req->wb_context->cred != prev->wb_context->cred) + return 0; +@@ -254,6 +263,12 @@ static int nfs_can_coalesce_requests(str + return 0; + if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return 0; ++ if (req->wb_lseg != prev->wb_lseg) ++ return 0; ++#ifdef CONFIG_NFS_V4_1 ++ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) ++ return 0; ++#endif /* CONFIG_NFS_V4_1 */ + return 1; + } + +@@ -286,7 +301,7 @@ static int nfs_pageio_do_add_request(str + if (newlen > desc->pg_bsize) + return 0; + prev = nfs_list_entry(desc->pg_list.prev); +- if (!nfs_can_coalesce_requests(prev, req)) ++ if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; + } else + desc->pg_base = req->wb_pgbase; +@@ -375,6 +390,7 @@ void nfs_pageio_cond_complete(struct nfs + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * @tag: tag to scan for ++ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver + * + * Moves elements from one of the inode request lists. 
+ * If the number of requests is set to 0, the entire address_space +@@ -384,7 +400,7 @@ void nfs_pageio_cond_complete(struct nfs + */ + int nfs_scan_list(struct nfs_inode *nfsi, + struct list_head *dst, pgoff_t idx_start, +- unsigned int npages, int tag) ++ unsigned int npages, int tag, int *use_pnfs) + { + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; +@@ -415,6 +431,8 @@ int nfs_scan_list(struct nfs_inode *nfsi + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, tag); + nfs_list_add_request(req, dst); ++ if (req->wb_lseg) ++ *use_pnfs = 1; + res++; + if (res == INT_MAX) + goto out; +diff -up linux-2.6.38.noarch/fs/nfs/pnfs.c.orig linux-2.6.38.noarch/fs/nfs/pnfs.c +--- linux-2.6.38.noarch/fs/nfs/pnfs.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/pnfs.c 2011-03-26 07:57:44.271821338 -0400 +@@ -30,6 +30,7 @@ + #include + #include "internal.h" + #include "pnfs.h" ++#include "iostat.h" + + #define NFSDBG_FACILITY NFSDBG_PNFS + +@@ -71,6 +72,52 @@ find_pnfs_driver(u32 id) + return local; + } + ++/* Set cred to indicate we require a layoutcommit ++ * If we don't even have a layout, we don't need to commit it. ++ */ ++void ++pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx) ++{ ++ dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (has_layout(nfsi) && ++ !test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags)) { ++ nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred); ++ __set_bit(NFS_LAYOUT_NEED_LCOMMIT, ++ &nfsi->layout->plh_flags); ++ nfsi->change_attr++; ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s: Set layoutcommit\n", __func__); ++ return; ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Update last_write_offset for layoutcommit. ++ * TODO: We should only use commited extents, but the current nfs ++ * implementation does not calculate the written range in nfs_commit_done. 
++ * We therefore update this field in writeback_done. ++ */ ++void ++pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent) ++{ ++ loff_t end_pos; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (offset < nfsi->layout->write_begin_pos) ++ nfsi->layout->write_begin_pos = offset; ++ end_pos = offset + extent - 1; /* I'm being inclusive */ ++ if (end_pos > nfsi->layout->write_end_pos) ++ nfsi->layout->write_end_pos = end_pos; ++ dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n", ++ __func__, ++ (unsigned long) extent, ++ (unsigned long) offset , ++ (unsigned long) nfsi->layout->write_begin_pos, ++ (unsigned long) nfsi->layout->write_end_pos); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ + void + unset_pnfs_layoutdriver(struct nfs_server *nfss) + { +@@ -88,7 +135,8 @@ unset_pnfs_layoutdriver(struct nfs_serve + * @id layout type. Zero (illegal layout type) indicates pNFS not in use. + */ + void +-set_pnfs_layoutdriver(struct nfs_server *server, u32 id) ++set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, ++ u32 id) + { + struct pnfs_layoutdriver_type *ld_type = NULL; + +@@ -115,7 +163,7 @@ set_pnfs_layoutdriver(struct nfs_server + goto out_no_driver; + } + server->pnfs_curr_ld = ld_type; +- if (ld_type->set_layoutdriver(server)) { ++ if (ld_type->set_layoutdriver(server, mntfh)) { + printk(KERN_ERR + "%s: Error initializing mount point for layout driver %u.\n", + __func__, id); +@@ -146,6 +194,14 @@ pnfs_register_layoutdriver(struct pnfs_l + return status; + } + ++ if (!ld_type->read_pagelist || !ld_type->write_pagelist || ++ !ld_type->commit) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "read_pagelist, write_pagelist, and commit.\n", ++ __func__); ++ return status; ++ } ++ + spin_lock(&pnfs_spinlock); + tmp = find_pnfs_driver_locked(ld_type->id); + if (!tmp) { +@@ -184,18 +240,35 @@ get_layout_hdr(struct pnfs_layout_hdr *l + atomic_inc(&lo->plh_refcount); + } + ++static struct pnfs_layout_hdr * 
++pnfs_alloc_layout_hdr(struct inode *ino) ++{ ++ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; ++ return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino) : ++ kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); ++} ++ ++static void ++pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) ++{ ++ struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; ++ return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); ++} ++ + static void + destroy_layout_hdr(struct pnfs_layout_hdr *lo) + { + dprintk("%s: freeing layout cache %p\n", __func__, lo); + BUG_ON(!list_empty(&lo->plh_layouts)); + NFS_I(lo->plh_inode)->layout = NULL; +- kfree(lo); ++ pnfs_free_layout_hdr(lo); + } + + static void + put_layout_hdr_locked(struct pnfs_layout_hdr *lo) + { ++ assert_spin_locked(&lo->plh_inode->i_lock); ++ BUG_ON(atomic_read(&lo->plh_refcount) == 0); + if (atomic_dec_and_test(&lo->plh_refcount)) + destroy_layout_hdr(lo); + } +@@ -205,6 +278,7 @@ put_layout_hdr(struct pnfs_layout_hdr *l + { + struct inode *inode = lo->plh_inode; + ++ BUG_ON(atomic_read(&lo->plh_refcount) == 0); + if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + destroy_layout_hdr(lo); + spin_unlock(&inode->i_lock); +@@ -225,64 +299,136 @@ static void free_lseg(struct pnfs_layout + { + struct inode *ino = lseg->pls_layout->plh_inode; + ++ BUG_ON(atomic_read(&lseg->pls_refcount) != 0); + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + /* Matched by get_layout_hdr in pnfs_insert_layout */ + put_layout_hdr(NFS_I(ino)->layout); + } + +-/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg +- * could sleep, so must be called outside of the lock. +- * Returns 1 if object was removed, otherwise return 0. 
+- */ +-static int +-put_lseg_locked(struct pnfs_layout_segment *lseg, +- struct list_head *tmp_list) ++static void ++put_lseg_common(struct pnfs_layout_segment *lseg) ++{ ++ struct inode *inode = lseg->pls_layout->plh_inode; ++ ++ BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); ++ list_del_init(&lseg->pls_list); ++ if (list_empty(&lseg->pls_layout->plh_segs)) { ++ set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); ++ /* Matched by initial refcount set in alloc_init_layout_hdr */ ++ put_layout_hdr_locked(lseg->pls_layout); ++ } ++ rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); ++} ++ ++void ++put_lseg(struct pnfs_layout_segment *lseg) + { ++ struct inode *ino; ++ ++ if (!lseg) ++ return; ++ + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount), + test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); +- if (atomic_dec_and_test(&lseg->pls_refcount)) { +- struct inode *ino = lseg->pls_layout->plh_inode; ++ ino = lseg->pls_layout->plh_inode; ++ if (atomic_dec_and_lock(&lseg->pls_refcount, &ino->i_lock)) { ++ put_lseg_common(lseg); ++ spin_unlock(&ino->i_lock); ++ free_lseg(lseg); ++ } ++} ++EXPORT_SYMBOL_GPL(put_lseg); + +- BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); +- list_del(&lseg->pls_list); +- if (list_empty(&lseg->pls_layout->plh_segs)) { +- struct nfs_client *clp; ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; + +- clp = NFS_SERVER(ino)->nfs_client; +- spin_lock(&clp->cl_lock); +- /* List does not take a reference, so no need for put here */ +- list_del_init(&lseg->pls_layout->plh_layouts); +- spin_unlock(&clp->cl_lock); +- clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags); +- } +- rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); +- list_add(&lseg->pls_list, tmp_list); +- return 1; +- } +- return 0; ++ end = start + len; ++ return end >= start ? 
end: NFS4_MAX_UINT64; + } + +-static bool +-should_free_lseg(u32 lseg_iomode, u32 recall_iomode) ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) + { +- return (recall_iomode == IOMODE_ANY || +- lseg_iomode == recall_iomode); ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1: NFS4_MAX_UINT64; + } + +-/* Returns 1 if lseg is removed from list, 0 otherwise */ +-static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, +- struct list_head *tmp_list) ++/* ++ * is l2 fully contained in l1? ++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_contained(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (start1 <= start2) && (end1 >= end2); ++} ++ ++/* ++ * is l1 and l2 intersecting? 
++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_intersecting(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) + { +- int rv = 0; ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (end1 == NFS4_MAX_UINT64 || end1 > start2) && ++ (end2 == NFS4_MAX_UINT64 || end2 > start1); ++} + +- if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { ++bool ++should_free_lseg(struct pnfs_layout_range *lseg_range, ++ struct pnfs_layout_range *recall_range) ++{ ++ return (recall_range->iomode == IOMODE_ANY || ++ lseg_range->iomode == recall_range->iomode) && ++ lo_seg_intersecting(lseg_range, recall_range); ++} ++ ++static bool mark_lseg_invalid(struct pnfs_layout_segment *lseg, ++ struct list_head *tmp_list) ++{ ++ bool rv; ++ ++ assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock); ++ rv = test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags); ++ if (rv) { + /* Remove the reference keeping the lseg in the + * list. It will now be removed when all + * outstanding io is finished. 
+ */ +- rv = put_lseg_locked(lseg, tmp_list); ++ dprintk("%s: lseg %p ref %d\n", __func__, lseg, ++ atomic_read(&lseg->pls_refcount)); ++ if (atomic_dec_and_test(&lseg->pls_refcount)) { ++ put_lseg_common(lseg); ++ list_add(&lseg->pls_list, tmp_list); ++ rv = true; ++ } + } ++ + return rv; + } + +@@ -292,18 +438,24 @@ static int mark_lseg_invalid(struct pnfs + int + mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, +- u32 iomode) ++ struct pnfs_layout_range *recall_range) + { + struct pnfs_layout_segment *lseg, *next; + int invalid = 0, removed = 0; + + dprintk("%s:Begin lo %p\n", __func__, lo); + ++ if (list_empty(&lo->plh_segs)) { ++ if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) ++ put_layout_hdr_locked(lo); ++ return 0; ++ } + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) +- if (should_free_lseg(lseg->pls_range.iomode, iomode)) { ++ if (should_free_lseg(&lseg->pls_range, recall_range)) { + dprintk("%s: freeing lseg %p iomode %d " + "offset %llu length %llu\n", __func__, +- lseg, lseg->pls_range.iomode, lseg->pls_range.offset, ++ lseg, lseg->pls_range.iomode, ++ lseg->pls_range.offset, + lseg->pls_range.length); + invalid++; + removed += mark_lseg_invalid(lseg, tmp_list); +@@ -312,11 +464,57 @@ mark_matching_lsegs_invalid(struct pnfs_ + return invalid - removed; + } + ++/* Returns false if there was nothing to do, true otherwise */ ++static bool ++pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *lseg, *next; ++ bool rv = false; ++ ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ assert_spin_locked(&lo->plh_inode->i_lock); ++ if (list_empty(&lo->plh_segs)) { ++ if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) ++ put_layout_hdr_locked(lo); ++ return 0; ++ } ++ list_for_each_entry_safe(lseg, next, 
&lo->plh_segs, pls_list) ++ if (should_free_lseg(&lseg->pls_range, range)) { ++ dprintk("%s: freeing lseg %p iomode %d " ++ "offset %llu length %llu\n", __func__, ++ lseg, lseg->pls_range.iomode, ++ lseg->pls_range.offset, ++ lseg->pls_range.length); ++ mark_lseg_invalid(lseg, tmp_list); ++ rv = true; ++ } ++ dprintk("%s:Return %d\n", __func__, rv); ++ return rv; ++} ++ ++/* note free_me must contain lsegs from a single layout_hdr */ + void + pnfs_free_lseg_list(struct list_head *free_me) + { + struct pnfs_layout_segment *lseg, *tmp; ++ struct pnfs_layout_hdr *lo; ++ ++ if (list_empty(free_me)) ++ return; ++ ++ lo = list_first_entry(free_me, struct pnfs_layout_segment, ++ pls_list)->pls_layout; + ++ if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) { ++ struct nfs_client *clp; ++ ++ clp = NFS_SERVER(lo->plh_inode)->nfs_client; ++ spin_lock(&clp->cl_lock); ++ list_del_init(&lo->plh_layouts); ++ spin_unlock(&clp->cl_lock); ++ } + list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { + list_del(&lseg->pls_list); + free_lseg(lseg); +@@ -328,14 +526,17 @@ pnfs_destroy_layout(struct nfs_inode *nf + { + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_ANY, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; + + spin_lock(&nfsi->vfs_inode.i_lock); + lo = nfsi->layout; + if (lo) { +- set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); +- mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); +- /* Matched by refcount set to 1 in alloc_init_layout_hdr */ +- put_layout_hdr_locked(lo); ++ lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ ++ mark_matching_lsegs_invalid(lo, &tmp_list, &range); + } + spin_unlock(&nfsi->vfs_inode.i_lock); + pnfs_free_lseg_list(&tmp_list); +@@ -371,16 +572,14 @@ pnfs_set_layout_stateid(struct pnfs_layo + { + u32 oldseq, newseq; + ++ assert_spin_locked(&lo->plh_inode->i_lock); + oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid); + newseq = 
be32_to_cpu(new->stateid.seqid); + if ((int)(newseq - oldseq) > 0) { + memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid)); +- if (update_barrier) { +- u32 new_barrier = be32_to_cpu(new->stateid.seqid); +- +- if ((int)(new_barrier - lo->plh_barrier)) +- lo->plh_barrier = new_barrier; +- } else { ++ if (update_barrier) ++ lo->plh_barrier = be32_to_cpu(new->stateid.seqid); ++ else { + /* Because of wraparound, we want to keep the barrier + * "close" to the current seqids. It needs to be + * within 2**31 to count as "behind", so if it +@@ -403,6 +602,7 @@ pnfs_layoutgets_blocked(struct pnfs_layo + (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) + return true; + return lo->plh_block_lgets || ++ test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + (list_empty(&lo->plh_segs) && + (atomic_read(&lo->plh_outstanding) > lget)); +@@ -429,7 +629,7 @@ pnfs_choose_layoutget_stateid(nfs4_state + } else + memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data)); + spin_unlock(&lo->plh_inode->i_lock); +- dprintk("<-- %s\n", __func__); ++ dprintk("<-- %s status=%d\n", __func__, status); + return status; + } + +@@ -442,7 +642,7 @@ pnfs_choose_layoutget_stateid(nfs4_state + static struct pnfs_layout_segment * + send_layoutget(struct pnfs_layout_hdr *lo, + struct nfs_open_context *ctx, +- u32 iomode) ++ struct pnfs_layout_range *range) + { + struct inode *ino = lo->plh_inode; + struct nfs_server *server = NFS_SERVER(ino); +@@ -455,11 +655,11 @@ send_layoutget(struct pnfs_layout_hdr *l + lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); + if (lgp == NULL) + return NULL; +- lgp->args.minlength = NFS4_MAX_UINT64; ++ lgp->args.minlength = PAGE_CACHE_SIZE; ++ if (lgp->args.minlength > range->length) ++ lgp->args.minlength = range->length; + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; +- lgp->args.range.iomode = iomode; +- lgp->args.range.offset = 0; +- lgp->args.range.length = NFS4_MAX_UINT64; ++ 
lgp->args.range = *range; + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); +@@ -471,11 +671,82 @@ send_layoutget(struct pnfs_layout_hdr *l + nfs4_proc_layoutget(lgp); + if (!lseg) { + /* remember that LAYOUTGET failed and suspend trying */ +- set_bit(lo_fail_bit(iomode), &lo->plh_flags); ++ set_bit(lo_fail_bit(range->iomode), &lo->plh_flags); + } + return lseg; + } + ++static int ++return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait) ++{ ++ struct nfs4_layoutreturn *lrp; ++ struct nfs_server *server = NFS_SERVER(ino); ++ int status = -ENOMEM; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (lrp == NULL) { ++ put_layout_hdr(NFS_I(ino)->layout); ++ goto out; ++ } ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = server->pnfs_curr_ld->id; ++ lrp->args.return_type = RETURN_FILE; ++ lrp->args.range = *range; ++ lrp->args.inode = ino; ++ lrp->clp = server->nfs_client; ++ ++ status = nfs4_proc_layoutreturn(lrp, wait); ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++} ++ ++/* Initiates a LAYOUTRETURN(FILE) */ ++int ++_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, ++ bool wait) ++{ ++ struct pnfs_layout_hdr *lo = NULL; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_range arg; ++ LIST_HEAD(tmp_list); ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ arg.iomode = range ? 
range->iomode : IOMODE_ANY; ++ arg.offset = 0; ++ arg.length = NFS4_MAX_UINT64; ++ ++ spin_lock(&ino->i_lock); ++ lo = nfsi->layout; ++ if (!lo || !pnfs_clear_lseg_list(lo, &tmp_list, &arg)) { ++ spin_unlock(&ino->i_lock); ++ dprintk("%s: no layout segments to return\n", __func__); ++ goto out; ++ } ++ /* Reference matched in nfs4_layoutreturn_release */ ++ get_layout_hdr(lo); ++ spin_unlock(&ino->i_lock); ++ pnfs_free_lseg_list(&tmp_list); ++ ++ if (layoutcommit_needed(nfsi)) { ++ status = pnfs_layoutcommit_inode(ino, wait); ++ if (status) { ++ /* Return layout even if layoutcommit fails */ ++ dprintk("%s: layoutcommit failed, status=%d. " ++ "Returning layout anyway\n", ++ __func__, status); ++ } ++ } ++ status = return_layout(ino, &arg, wait); ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++} ++ + bool pnfs_roc(struct inode *ino) + { + struct pnfs_layout_hdr *lo; +@@ -559,10 +830,24 @@ bool pnfs_roc_drain(struct inode *ino, u + * are seen first. + */ + static s64 +-cmp_layout(u32 iomode1, u32 iomode2) ++cmp_layout(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) + { ++ s64 d; ++ ++ /* higher offset > lower offset */ ++ d = l1->offset - l2->offset; ++ if (d) ++ return d; ++ ++ /* longer length > shorter length */ ++ d = l1->length - l2->length; ++ if (d) ++ return d; ++ + /* read > read/write */ +- return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ); ++ return (int)(l2->iomode == IOMODE_READ) - ++ (int)(l1->iomode == IOMODE_READ); + } + + static void +@@ -576,7 +861,7 @@ pnfs_insert_layout(struct pnfs_layout_hd + + assert_spin_locked(&lo->plh_inode->i_lock); + list_for_each_entry(lp, &lo->plh_segs, pls_list) { +- if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0) ++ if (cmp_layout(&lp->pls_range, &lseg->pls_range) > 0) + continue; + list_add_tail(&lseg->pls_list, &lp->pls_list); + dprintk("%s: inserted lseg %p " +@@ -606,7 +891,7 @@ alloc_init_layout_hdr(struct inode *ino) + { + struct 
pnfs_layout_hdr *lo; + +- lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); ++ lo = pnfs_alloc_layout_hdr(ino); + if (!lo) + return NULL; + atomic_set(&lo->plh_refcount, 1); +@@ -639,13 +924,13 @@ pnfs_find_alloc_layout(struct inode *ino + if (likely(nfsi->layout == NULL)) /* Won the race? */ + nfsi->layout = new; + else +- kfree(new); ++ pnfs_free_layout_hdr(new); + return nfsi->layout; + } + + /* + * iomode matching rules: +- * iomode lseg match ++ * range lseg match + * ----- ----- ----- + * ANY READ true + * ANY RW true +@@ -655,16 +940,28 @@ pnfs_find_alloc_layout(struct inode *ino + * READ RW true + */ + static int +-is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) ++is_matching_lseg(struct pnfs_layout_segment *lseg, ++ struct pnfs_layout_range *range) + { +- return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW); ++ struct pnfs_layout_range range1; ++ ++ if ((range->iomode == IOMODE_RW && ++ lseg->pls_range.iomode != IOMODE_RW) || ++ !lo_seg_intersecting(&lseg->pls_range, range)) ++ return 0; ++ ++ /* range1 covers only the first byte in the range */ ++ range1 = *range; ++ range1.length = 1; ++ return lo_seg_contained(&lseg->pls_range, &range1); + } + + /* + * lookup range in layout + */ + static struct pnfs_layout_segment * +-pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) ++pnfs_find_lseg(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) + { + struct pnfs_layout_segment *lseg, *ret = NULL; + +@@ -673,16 +970,17 @@ pnfs_find_lseg(struct pnfs_layout_hdr *l + assert_spin_locked(&lo->plh_inode->i_lock); + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && +- is_matching_lseg(lseg, iomode)) { +- ret = lseg; ++ is_matching_lseg(lseg, range)) { ++ ret = get_lseg(lseg); + break; + } +- if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) ++ if (cmp_layout(range, &lseg->pls_range) > 0) + break; + } + +- dprintk("%s:Return lseg %p ref %d\n", +- __func__, ret, 
ret ? atomic_read(&ret->pls_refcount) : 0); ++ dprintk("%s:Return lseg %p ref %d valid %d\n", ++ __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0, ++ ret ? test_bit(NFS_LSEG_VALID, &ret->pls_flags) : 0); + return ret; + } + +@@ -693,12 +991,20 @@ pnfs_find_lseg(struct pnfs_layout_hdr *l + struct pnfs_layout_segment * + pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, ++ loff_t pos, ++ u64 count, + enum pnfs_iomode iomode) + { ++ struct pnfs_layout_range arg = { ++ .iomode = iomode, ++ .offset = pos, ++ .length = count, ++ }; + struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg = NULL; ++ bool first = false; + + if (!pnfs_enabled_sb(NFS_SERVER(ino))) + return NULL; +@@ -715,21 +1021,25 @@ pnfs_update_layout(struct inode *ino, + dprintk("%s matches recall, use MDS\n", __func__); + goto out_unlock; + } +- /* Check to see if the layout for the given range already exists */ +- lseg = pnfs_find_lseg(lo, iomode); +- if (lseg) +- goto out_unlock; + + /* if LAYOUTGET already failed once we don't try again */ + if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) + goto out_unlock; + ++ /* Check to see if the layout for the given range already exists */ ++ lseg = pnfs_find_lseg(lo, &arg); ++ if (lseg) ++ goto out_unlock; ++ + if (pnfs_layoutgets_blocked(lo, NULL, 0)) + goto out_unlock; + atomic_inc(&lo->plh_outstanding); + + get_layout_hdr(lo); +- if (list_empty(&lo->plh_segs)) { ++ if (list_empty(&lo->plh_segs)) ++ first = true; ++ spin_unlock(&ino->i_lock); ++ if (first) { + /* The lo must be on the clp list if there is any + * chance of a CB_LAYOUTRECALL(FILE) coming in. 
+ */ +@@ -738,24 +1048,18 @@ pnfs_update_layout(struct inode *ino, + list_add_tail(&lo->plh_layouts, &clp->cl_layouts); + spin_unlock(&clp->cl_lock); + } +- spin_unlock(&ino->i_lock); + +- lseg = send_layoutget(lo, ctx, iomode); +- if (!lseg) { +- spin_lock(&ino->i_lock); +- if (list_empty(&lo->plh_segs)) { +- spin_lock(&clp->cl_lock); +- list_del_init(&lo->plh_layouts); +- spin_unlock(&clp->cl_lock); +- clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); +- } +- spin_unlock(&ino->i_lock); ++ lseg = send_layoutget(lo, ctx, &arg); ++ if (!lseg && first) { ++ spin_lock(&clp->cl_lock); ++ list_del_init(&lo->plh_layouts); ++ spin_unlock(&clp->cl_lock); + } + atomic_dec(&lo->plh_outstanding); + put_layout_hdr(lo); + out: + dprintk("%s end, state 0x%lx lseg %p\n", __func__, +- nfsi->layout->plh_flags, lseg); ++ nfsi->layout->plh_flags ? nfsi->layout->plh_flags : -1, lseg); + return lseg; + out_unlock: + spin_unlock(&ino->i_lock); +@@ -772,17 +1076,6 @@ pnfs_layout_process(struct nfs4_layoutge + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + int status = 0; + +- /* Verify we got what we asked for. +- * Note that because the xdr parsing only accepts a single +- * element array, this can fail even if the server is behaving +- * correctly. 
+- */ +- if (lgp->args.range.iomode > res->range.iomode || +- res->range.offset != 0 || +- res->range.length != NFS4_MAX_UINT64) { +- status = -EINVAL; +- goto out; +- } + /* Inject layout blob into I/O device driver */ + lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); + if (!lseg || IS_ERR(lseg)) { +@@ -808,7 +1101,7 @@ pnfs_layout_process(struct nfs4_layoutge + } + init_lseg(lo, lseg); + lseg->pls_range = res->range; +- *lgp->lsegpp = lseg; ++ *lgp->lsegpp = get_lseg(lseg); + pnfs_insert_layout(lo, lseg); + + if (res->return_on_close) { +@@ -829,6 +1122,523 @@ out_forget_reply: + goto out; + } + ++void ++readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset, ++ size_t *count) ++{ ++ struct page *first, *last; ++ loff_t foff, i_size = i_size_read(inode); ++ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; ++ size_t range; ++ ++ first = list_entry((pages)->prev, struct page, lru); ++ last = list_entry((pages)->next, struct page, lru); ++ ++ foff = (loff_t)first->index << PAGE_CACHE_SHIFT; ++ ++ range = (last->index - first->index) * PAGE_CACHE_SIZE; ++ if (last->index == end_index) ++ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; ++ else ++ range += PAGE_CACHE_SIZE; ++ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff, ++ range); ++ *offset = foff; ++ *count = range; ++} ++ ++void ++pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layoutdriver_type *ld; ++ ++ pgio->pg_test = NULL; ++ ++ lo = NFS_I(inode)->layout; ++ ld = NFS_SERVER(inode)->pnfs_curr_ld; ++ if (!ld || !lo) ++ return; ++ ++ pgio->pg_test = ld->pg_test; ++} ++ ++/* ++ * rsize is already set by caller to MDS rsize. 
++ */ ++void ++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, ++ struct inode *inode, ++ struct nfs_open_context *ctx, ++ struct list_head *pages, ++ size_t *rsize) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ size_t count = 0; ++ loff_t loff; ++ ++ pgio->pg_iswrite = 0; ++ pgio->pg_test = NULL; ++ pgio->pg_lseg = NULL; ++ ++ if (!pnfs_enabled_sb(nfss)) ++ return; ++ ++ readahead_range(inode, pages, &loff, &count); ++ pgio->pg_lseg = pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ); ++ if (pgio->pg_lseg) { ++ pnfs_set_pg_test(inode, pgio); ++ *rsize = NFS_SERVER(inode)->ds_rsize; ++ } ++} ++ ++void ++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, ++ size_t *wsize) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ pgio->pg_iswrite = 1; ++ if (!pnfs_enabled_sb(server)) ++ pgio->pg_test = NULL; ++ else { ++ pnfs_set_pg_test(inode, pgio); ++ *wsize = server->ds_wsize; ++ } ++} ++ ++/* Set buffer size for data servers */ ++void ++pnfs_set_ds_iosize(struct nfs_server *server) ++{ ++ unsigned dssize = 0; ++ ++ if (server->pnfs_curr_ld && server->pnfs_curr_ld->get_blocksize) ++ dssize = server->pnfs_curr_ld->get_blocksize(); ++ if (dssize) ++ server->ds_rsize = server->ds_wsize = ++ nfs_block_size(dssize, NULL); ++ else { ++ server->ds_wsize = server->wsize; ++ server->ds_rsize = server->rsize; ++ } ++} ++ ++static int ++pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data) ++{ ++ put_lseg(pdata->lseg); ++ pdata->lseg = NULL; ++ pdata->call_ops->rpc_call_done(task, data); ++ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN) ++ return -EAGAIN; ++ if (pdata->pnfsflags & PNFS_NO_RPC) { ++ pdata->call_ops->rpc_release(data); ++ } else { ++ /* ++ * just restore original rpc call ops ++ * rpc_release will be called later by the rpc scheduling layer. 
++ */ ++ task->tk_ops = pdata->call_ops; ++ } ++ return 0; ++} ++ ++/* Post-write completion function ++ * Invoked by all layout drivers when write_pagelist is done. ++ * ++ * NOTE: callers set data->pnfsflags PNFS_NO_RPC ++ * so that the NFS cleanup routines perform only the page cache ++ * cleanup. ++ */ ++static void ++pnfs_write_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ struct pnfs_layout_range range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = wdata->args.offset; ++ range.length = wdata->args.count; ++ _pnfs_return_layout(wdata->inode, &range, true); ++ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode), ++ wdata->pdata.call_ops, wdata->pdata.how); ++} ++ ++void ++pnfs_writeback_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ /* update last write offset and need layout commit ++ * for non-files layout types (files layout calls ++ * pnfs4_write_done for this) ++ */ ++ if ((pdata->pnfsflags & PNFS_NO_RPC) && ++ data->task.tk_status >= 0 && data->res.count > 0) { ++ struct nfs_inode *nfsi = NFS_I(data->inode); ++ ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++ } ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++EXPORT_SYMBOL_GPL(pnfs_writeback_done); ++ ++static void _pnfs_clear_lseg_from_pages(struct list_head *head) ++{ ++ struct nfs_page *req; ++ ++ list_for_each_entry(req, head, wb_list) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem write function. 
++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. ++ * ++ * TODO: Is wdata->how and wdata->args.stable always the same value? ++ * TODO: It seems in NFS, the server may not do a stable write even ++ * though it was requested (and vice-versa?). To check, it looks ++ * in data->res.verf->committed. Do we need this ability ++ * for non-file layout drivers? ++ */ ++enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *wdata, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ struct inode *inode = wdata->inode; ++ enum pnfs_try_status trypnfs; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg; ++ ++ wdata->pdata.call_ops = call_ops; ++ wdata->pdata.pnfs_error = 0; ++ wdata->pdata.how = how; ++ ++ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, ++ inode->i_ino, wdata->args.count, wdata->args.offset, how); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ wdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ wdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, ++ nfs_page_array_len(wdata->args.pgbase, wdata->args.count), ++ how); ++ ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ wdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&wdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* Post-read completion function. 
Invoked by all layout drivers when ++ * read_pagelist is done ++ */ ++static void ++pnfs_read_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ struct pnfs_layout_range range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = rdata->args.offset; ++ range.length = rdata->args.count; ++ _pnfs_return_layout(rdata->inode, &range, true); ++ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode), ++ rdata->pdata.call_ops); ++} ++ ++void ++pnfs_read_done(struct nfs_read_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++EXPORT_SYMBOL_GPL(pnfs_read_done); ++ ++/* ++ * Call the appropriate parallel I/O subsystem read function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. 
++ */ ++enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *rdata, ++ const struct rpc_call_ops *call_ops) ++{ ++ struct inode *inode = rdata->inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg; ++ enum pnfs_try_status trypnfs; ++ ++ rdata->pdata.call_ops = call_ops; ++ rdata->pdata.pnfs_error = 0; ++ ++ dprintk("%s: Reading ino:%lu %u@%llu\n", ++ __func__, inode->i_ino, rdata->args.count, rdata->args.offset); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ rdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ rdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata, ++ nfs_page_array_len(rdata->args.pgbase, rdata->args.count)); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ rdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&rdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_READ); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* ++ * This gives the layout driver an opportunity to read in page "around" ++ * the data to be written. It returns 0 on success, otherwise an error code ++ * which will either be passed up to user, or ignored if ++ * some previous part of write succeeded. ++ * Note the range [pos, pos+len-1] is entirely within the page. 
++ */ ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata) ++{ ++ struct pnfs_fsdata *data; ++ int status = 0; ++ ++ dprintk("--> %s: pos=%llu len=%u\n", ++ __func__, (unsigned long long)pos, len); ++ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL); ++ if (!data) { ++ status = -ENOMEM; ++ goto out; ++ } ++ data->lseg = lseg; /* refcount passed into data to be managed there */ ++ status = NFS_SERVER(inode)->pnfs_curr_ld->write_begin( ++ lseg, page, pos, len, data); ++ if (status) { ++ kfree(data); ++ data = NULL; ++ } ++out: ++ *fsdata = data; ++ dprintk("<-- %s: status=%d\n", __func__, status); ++ return status; ++} ++ ++/* pNFS Commit callback function for all layout drivers */ ++void ++pnfs_commit_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_RW, ++ .offset = data->args.offset, ++ .length = data->args.count, ++ }; ++ dprintk("%s: retrying\n", __func__); ++ _pnfs_return_layout(data->inode, &range, true); ++ pnfs_initiate_commit(data, NFS_CLIENT(data->inode), ++ pdata->call_ops, pdata->how, 1); ++ } ++} ++EXPORT_SYMBOL_GPL(pnfs_commit_done); ++ ++enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int sync) ++{ ++ struct inode *inode = data->inode; ++ struct nfs_server *nfss = NFS_SERVER(data->inode); ++ enum pnfs_try_status trypnfs; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ if (!pnfs_use_rpc(nfss)) ++ data->pdata.pnfsflags |= PNFS_NO_RPC; ++ /* We need to account for possibility that ++ * each nfs_page can point to a different lseg (or be NULL). ++ * For the immediate case of whole-file-only layouts, we at ++ * least know there can be only a single lseg. 
++ * We still have to account for the possibility of some being NULL. ++ * This will be done by passing the buck to the layout driver. ++ */ ++ data->pdata.call_ops = call_ops; ++ data->pdata.pnfs_error = 0; ++ data->pdata.how = sync; ++ data->pdata.lseg = NULL; ++ trypnfs = nfss->pnfs_curr_ld->commit(data, sync); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ data->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ _pnfs_clear_lseg_from_pages(&data->pages); ++ } else ++ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT); ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++void pnfs_cleanup_layoutcommit(struct inode *inode, ++ struct nfs4_layoutcommit_data *data) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ ++ /* TODO: Maybe we should avoid this by allowing the layout driver ++ * to directly xdr its layout on the wire. ++ */ ++ if (nfss->pnfs_curr_ld->cleanup_layoutcommit) ++ nfss->pnfs_curr_ld->cleanup_layoutcommit( ++ NFS_I(inode)->layout, data); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int ++pnfs_setup_layoutcommit(struct inode *inode, ++ struct nfs4_layoutcommit_data *data, ++ loff_t write_begin_pos, loff_t write_end_pos) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int result = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ data->args.inode = inode; ++ data->args.fh = NFS_FH(inode); ++ data->args.layout_type = nfss->pnfs_curr_ld->id; ++ data->res.fattr = &data->fattr; ++ nfs_fattr_init(&data->fattr); ++ ++ /* TODO: Need to determine the correct values */ ++ data->args.time_modify_changed = 0; ++ ++ /* Set values from inode so it can be reset ++ */ ++ data->args.range.iomode = IOMODE_RW; ++ data->args.range.offset = write_begin_pos; ++ data->args.range.length = write_end_pos - write_begin_pos + 1; ++ data->args.lastbytewritten = min(write_end_pos, ++ i_size_read(inode) - 1); ++ data->args.bitmask = nfss->attr_bitmask; ++ data->res.server = nfss; ++ ++ /* Call layout driver to set the arguments */ ++ if (nfss->pnfs_curr_ld->setup_layoutcommit) ++ result = nfss->pnfs_curr_ld->setup_layoutcommit( ++ NFS_I(inode)->layout, &data->args); ++ ++ dprintk("<-- %s Status %d\n", __func__, result); ++ return result; ++} ++ ++/* Issue a async layoutcommit for an inode. 
++ */ ++int ++pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ struct nfs4_layoutcommit_data *data; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ ++ int status = 0; ++ ++ dprintk("%s Begin (sync:%d)\n", __func__, sync); ++ ++ BUG_ON(!has_layout(nfsi)); ++ ++ data = kzalloc(sizeof(*data), GFP_NOFS); ++ if (!data) ++ return -ENOMEM; ++ ++ spin_lock(&inode->i_lock); ++ if (!layoutcommit_needed(nfsi)) { ++ spin_unlock(&inode->i_lock); ++ goto out_free; ++ } ++ ++ /* Clear layoutcommit properties in the inode so ++ * new lc info can be generated ++ */ ++ write_begin_pos = nfsi->layout->write_begin_pos; ++ write_end_pos = nfsi->layout->write_end_pos; ++ data->cred = nfsi->layout->cred; ++ nfsi->layout->write_begin_pos = 0; ++ nfsi->layout->write_end_pos = 0; ++ nfsi->layout->cred = NULL; ++ __clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags); ++ memcpy(data->args.stateid.data, nfsi->layout->plh_stateid.data, ++ NFS4_STATEID_SIZE); ++ ++ /* Reference for layoutcommit matched in pnfs_layoutcommit_release */ ++ get_layout_hdr(NFS_I(inode)->layout); ++ ++ spin_unlock(&inode->i_lock); ++ ++ /* Set up layout commit args */ ++ status = pnfs_setup_layoutcommit(inode, data, write_begin_pos, ++ write_end_pos); ++ if (status) { ++ /* The layout driver failed to setup the layoutcommit */ ++ put_rpccred(data->cred); ++ put_layout_hdr(NFS_I(inode)->layout); ++ goto out_free; ++ } ++ status = nfs4_proc_layoutcommit(data, sync); ++out: ++ dprintk("%s end (err:%d)\n", __func__, status); ++ return status; ++out_free: ++ kfree(data); ++ goto out; ++} ++ ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata) ++{ ++ /* lseg refcounting handled directly in nfs_write_end */ ++ kfree(fsdata); ++} ++ + /* + * Device ID cache. Currently supports one layout type per struct nfs_client. + * Add layout type to the lookup key to expand to support multiple types. 
+@@ -861,6 +1671,25 @@ pnfs_alloc_init_deviceid_cache(struct nf + } + EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache); + ++/* Must be called with locked c->dc_lock */ ++static struct pnfs_deviceid_node * ++pnfs_unhash_deviceid(struct pnfs_deviceid_cache *c, ++ struct nfs4_deviceid *id) ++{ ++ struct pnfs_deviceid_node *d; ++ struct hlist_node *n; ++ long h = nfs4_deviceid_hash(id); ++ ++ dprintk("%s hash %ld\n", __func__, h); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) ++ if (!memcmp(&d->de_id, id, sizeof(*id))) { ++ hlist_del_rcu(&d->de_node); ++ return d; ++ } ++ ++ return NULL; ++} ++ + /* + * Called from pnfs_layoutdriver_type->free_lseg + * last layout segment reference frees deviceid +@@ -869,29 +1698,33 @@ void + pnfs_put_deviceid(struct pnfs_deviceid_cache *c, + struct pnfs_deviceid_node *devid) + { +- struct nfs4_deviceid *id = &devid->de_id; +- struct pnfs_deviceid_node *d; +- struct hlist_node *n; +- long h = nfs4_deviceid_hash(id); +- + dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); + if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) + return; + +- hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) +- if (!memcmp(&d->de_id, id, sizeof(*id))) { +- hlist_del_rcu(&d->de_node); +- spin_unlock(&c->dc_lock); +- synchronize_rcu(); +- c->dc_free_callback(devid); +- return; +- } ++ pnfs_unhash_deviceid(c, &devid->de_id); + spin_unlock(&c->dc_lock); +- /* Why wasn't it found in the list? 
*/ +- BUG(); ++ synchronize_rcu(); ++ c->dc_free_callback(devid); + } + EXPORT_SYMBOL_GPL(pnfs_put_deviceid); + ++void ++pnfs_delete_deviceid(struct pnfs_deviceid_cache *c, ++ struct nfs4_deviceid *id) ++{ ++ struct pnfs_deviceid_node *devid; ++ ++ spin_lock(&c->dc_lock); ++ devid = pnfs_unhash_deviceid(c, id); ++ spin_unlock(&c->dc_lock); ++ synchronize_rcu(); ++ dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); ++ if (atomic_dec_and_test(&devid->de_ref)) ++ c->dc_free_callback(devid); ++} ++EXPORT_SYMBOL_GPL(pnfs_delete_deviceid); ++ + /* Find and reference a deviceid */ + struct pnfs_deviceid_node * + pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id) +diff -up linux-2.6.38.noarch/fs/nfs/pnfs.h.orig linux-2.6.38.noarch/fs/nfs/pnfs.h +--- linux-2.6.38.noarch/fs/nfs/pnfs.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/pnfs.h 2011-03-26 07:57:44.272821329 -0400 +@@ -30,6 +30,9 @@ + #ifndef FS_NFS_PNFS_H + #define FS_NFS_PNFS_H + ++#include ++#include "callback.h" ++ + enum { + NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ + NFS_LSEG_ROC, /* roc bit received from server */ +@@ -43,6 +46,17 @@ struct pnfs_layout_segment { + struct pnfs_layout_hdr *pls_layout; + }; + ++enum pnfs_try_status { ++ PNFS_ATTEMPTED = 0, ++ PNFS_NOT_ATTEMPTED = 1, ++}; ++ ++struct pnfs_fsdata { ++ struct pnfs_layout_segment *lseg; ++ int bypass_eof; ++ void *private; ++}; ++ + #ifdef CONFIG_NFS_V4_1 + + #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" +@@ -51,20 +65,87 @@ enum { + NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ + NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ + NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ ++ NFS_LAYOUT_NEED_LCOMMIT, /* LAYOUTCOMMIT needed */ + NFS_LAYOUT_ROC, /* some lseg had roc bit set */ + NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ + }; + ++enum layoutdriver_policy_flags { ++ /* Should the full nfs rpc 
cleanup code be used after io */ ++ PNFS_USE_RPC_CODE = 1 << 0, ++ ++ /* Should the pNFS client commit and return the layout upon a setattr */ ++ PNFS_LAYOUTRET_ON_SETATTR = 1 << 1, ++}; ++ + /* Per-layout driver specific registration structure */ + struct pnfs_layoutdriver_type { + struct list_head pnfs_tblid; + const u32 id; + const char *name; + struct module *owner; +- int (*set_layoutdriver) (struct nfs_server *); ++ unsigned flags; ++ int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); + int (*clear_layoutdriver) (struct nfs_server *); ++ ++ struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode); ++ void (*free_layout_hdr) (struct pnfs_layout_hdr *); ++ + struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); + void (*free_lseg) (struct pnfs_layout_segment *lseg); ++ ++ /* test for nfs page cache coalescing */ ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++ ++ /* Retreive the block size of the file system. ++ * If gather_across_stripes == 1, then the file system will gather ++ * requests into the block size. ++ * TODO: Where will the layout driver get this info? It is hard ++ * coded in PVFS2. ++ */ ++ ssize_t (*get_blocksize) (void); ++ ++/* read and write pagelist should return just 0 (to indicate that ++ * the layout code has taken control) or 1 (to indicate that the ++ * layout code wishes to fall back to normal nfs.) If 0 is returned, ++ * information can be passed back through nfs_data->res and ++ * nfs_data->task.tk_status, and the appropriate pnfs done function ++ * MUST be called. 
++ */ ++ enum pnfs_try_status ++ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages); ++ enum pnfs_try_status ++ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how); ++ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page, ++ loff_t pos, unsigned count, ++ struct pnfs_fsdata *fsdata); ++ int (*write_end)(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ void (*write_end_cleanup)(struct file *filp, ++ struct pnfs_fsdata *fsdata); ++ ++ /* Consistency ops */ ++ /* 2 problems: ++ * 1) the page list contains nfs_pages, NOT pages ++ * 2) currently the NFS code doesn't create a page array (as it does with read/write) ++ */ ++ enum pnfs_try_status ++ (*commit) (struct nfs_write_data *nfs_data, int how); ++ ++ int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutcommit_args *args); ++ ++ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args); ++ ++ void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutcommit_data *data); ++ ++ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args); + }; + + struct pnfs_layout_hdr { +@@ -72,11 +153,18 @@ struct pnfs_layout_hdr { + struct list_head plh_layouts; /* other client layouts */ + struct list_head plh_bulk_recall; /* clnt list of bulk recalls */ + struct list_head plh_segs; /* layout segments list */ ++ int roc_iomode;/* return on close iomode, 0=none */ + nfs4_stateid plh_stateid; + atomic_t plh_outstanding; /* number of RPCs out */ + unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ + u32 plh_barrier; /* ignore lower seqids */ + unsigned long plh_flags; ++ struct rpc_cred *cred; /* layoutcommit credential */ ++ /* DH: These vars keep track of the maximum write 
range ++ * so the values can be used for layoutcommit. ++ */ ++ loff_t write_begin_pos; ++ loff_t write_end_pos; + struct inode *plh_inode; + }; + +@@ -90,6 +178,14 @@ struct pnfs_device { + unsigned int pglen; + }; + ++#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 ++ ++struct pnfs_devicelist { ++ unsigned int eof; ++ unsigned int num_devs; ++ struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; ++}; ++ + /* + * Device ID RCU cache. A device ID is unique per client ID and layout type. + */ +@@ -135,22 +231,52 @@ extern struct pnfs_deviceid_node *pnfs_a + struct pnfs_deviceid_node *); + extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c, + struct pnfs_deviceid_node *devid); ++extern void pnfs_delete_deviceid(struct pnfs_deviceid_cache *, ++ struct nfs4_deviceid *); + + extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); + extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); + + /* nfs4proc.c */ ++extern int nfs4_proc_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); + extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *dev); + extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); ++extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, ++ int issync); ++extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait); + + /* pnfs.c */ + void get_layout_hdr(struct pnfs_layout_hdr *lo); ++void put_lseg(struct pnfs_layout_segment *lseg); ++bool should_free_lseg(struct pnfs_layout_range *lseg_range, ++ struct pnfs_layout_range *recall_range); + struct pnfs_layout_segment * + pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, +- enum pnfs_iomode access_type); +-void set_pnfs_layoutdriver(struct nfs_server *, u32 id); ++ loff_t pos, u64 count, enum pnfs_iomode access_type); ++int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait); ++void 
set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id); + void unset_pnfs_layoutdriver(struct nfs_server *); ++enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, ++ const struct rpc_call_ops *); ++void pnfs_cleanup_layoutcommit(struct inode *, ++ struct nfs4_layoutcommit_data *); ++int pnfs_layoutcommit_inode(struct inode *inode, int sync); ++void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent); ++void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx); ++void pnfs_set_ds_iosize(struct nfs_server *server); ++enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, ++ struct nfs_open_context *, struct list_head *, ++ size_t *); ++void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, ++ size_t *); ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata); + int pnfs_layout_process(struct nfs4_layoutget *lgp); + void pnfs_free_lseg_list(struct list_head *tmp_list); + void pnfs_destroy_layout(struct nfs_inode *); +@@ -162,14 +288,26 @@ void pnfs_set_layout_stateid(struct pnfs + int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, + struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state); ++void pnfs_read_done(struct nfs_read_data *); ++void pnfs_writeback_done(struct nfs_write_data *); ++void pnfs_commit_done(struct nfs_write_data *); ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata); + int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, +- u32 iomode); ++ struct pnfs_layout_range *recall_range); + bool pnfs_roc(struct inode *ino); + void pnfs_roc_release(struct inode *ino); + 
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); + bool pnfs_roc_drain(struct inode *ino, u32 *barrier); + ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout != NULL; ++} + + static inline int lo_fail_bit(u32 iomode) + { +@@ -177,12 +315,141 @@ static inline int lo_fail_bit(u32 iomode + NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; + } + ++static inline struct pnfs_layout_segment * ++get_lseg(struct pnfs_layout_segment *lseg) ++{ ++ if (lseg) { ++ atomic_inc(&lseg->pls_refcount); ++ smp_mb__after_atomic_inc(); ++ } ++ return lseg; ++} ++ + /* Return true if a layout driver is being used for this mountpoint */ + static inline int pnfs_enabled_sb(struct nfs_server *nfss) + { + return nfss->pnfs_curr_ld != NULL; + } + ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) || ++ !fsdata->bypass_eof; ++} ++ ++/* Should the pNFS client commit and return the layout upon a setattr */ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ if (!pnfs_enabled_sb(NFS_SERVER(inode))) ++ return false; ++ return NFS_SERVER(inode)->pnfs_curr_ld->flags & ++ PNFS_LAYOUTRET_ON_SETATTR; ++} ++ ++static inline bool pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ if (pnfs_enabled_sb(nfss)) ++ return nfss->pnfs_curr_ld->flags & PNFS_USE_RPC_CODE; ++ ++ return true; ++} ++ ++/* Should the pNFS client commit and return the layout on close ++ */ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout->roc_iomode; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status = 0; ++ ++ *fsdata = lseg; ++ if (lseg && nfss->pnfs_curr_ld->write_begin) ++ status = 
_pnfs_write_begin(inode, page, pos, len, lseg, ++ (struct pnfs_fsdata **) fsdata); ++ return status; ++} ++ ++/* CAREFUL - what happens if copied < len??? */ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ ++ if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_end) ++ return nfss->pnfs_curr_ld->write_end(inode, page, pos, len, ++ copied, lseg); ++ else ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ if (fsdata && nfss->pnfs_curr_ld) { ++ if (nfss->pnfs_curr_ld->write_end_cleanup) ++ nfss->pnfs_curr_ld->write_end_cleanup(filp, fsdata); ++ if (nfss->pnfs_curr_ld->write_begin) ++ pnfs_free_fsdata(fsdata); ++ } ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct pnfs_layout_range *range, ++ bool wait) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss) && has_layout(nfsi)) ++ return _pnfs_return_layout(ino, range, wait); ++ ++ return 0; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return has_layout(nfsi) && ++ test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags); ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_begin) ++ return ((struct pnfs_fsdata *) fsdata)->lseg; ++ 
return (struct pnfs_layout_segment *)fsdata; ++ } ++ return NULL; ++} ++ + #else /* CONFIG_NFS_V4_1 */ + + static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) +@@ -194,18 +461,66 @@ static inline void pnfs_destroy_layout(s + } + + static inline struct pnfs_layout_segment * ++get_lseg(struct pnfs_layout_segment *lseg) ++{ ++ return NULL; ++} ++ ++static inline void put_lseg(struct pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline struct pnfs_layout_segment * + pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, +- enum pnfs_iomode access_type) ++ loff_t pos, u64 count, enum pnfs_iomode access_type) + { + return NULL; + } + + static inline bool +-pnfs_roc(struct inode *ino) ++has_layout(struct nfs_inode *nfsi) + { + return false; + } + ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return 0; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return 1; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ return 0; ++} ++ + static inline void + pnfs_roc_release(struct inode *ino) + { +@@ -222,7 +537,37 @@ pnfs_roc_drain(struct inode *ino, u32 *b + return false; + } + +-static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) ++static inline bool ++pnfs_roc(struct inode *ino) ++{ ++ return false; ++} ++ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ 
return false; ++} ++ ++static inline bool pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ return true; ++} ++ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode *nfsi) ++{ ++ return 0; ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct pnfs_layout_range *range, ++ bool wait) ++{ ++ return 0; ++} ++ ++static inline void set_pnfs_layoutdriver(struct nfs_server *s, const struct nfs_fh *mntfh, u32 id) + { + } + +@@ -230,6 +575,62 @@ static inline void unset_pnfs_layoutdriv + { + } + ++static inline void pnfs_set_ds_iosize(struct nfs_server *server) ++{ ++ server->ds_wsize = server->ds_rsize = -1; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ *fsdata = NULL; ++ return 0; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return 0; ++} ++ ++static inline void ++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino, ++ struct nfs_open_context *ctx, struct list_head *pages, ++ size_t *rsize) ++{ ++ pgio->pg_lseg = NULL; ++} ++ ++static inline void ++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino, ++ size_t *wsize) ++{ ++ pgio->pg_lseg = NULL; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ return NULL; ++} ++ + #endif /* CONFIG_NFS_V4_1 */ + + #endif /* FS_NFS_PNFS_H */ +diff -up linux-2.6.38.noarch/fs/nfs/read.c.orig linux-2.6.38.noarch/fs/nfs/read.c +--- linux-2.6.38.noarch/fs/nfs/read.c.orig 2011-03-14 
21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/read.c 2011-03-26 07:57:44.273821320 -0400 +@@ -18,14 +18,17 @@ + #include + #include + #include ++#include ++#include + + #include ++#include ++#include "pnfs.h" + + #include "nfs4_fs.h" + #include "internal.h" + #include "iostat.h" + #include "fscache.h" +-#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PAGECACHE + +@@ -117,12 +120,16 @@ int nfs_readpage_async(struct nfs_open_c + LIST_HEAD(one_request); + struct nfs_page *new; + unsigned int len; ++ loff_t pgoffs; ++ struct pnfs_layout_segment *lseg; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); +- pnfs_update_layout(inode, ctx, IOMODE_READ); +- new = nfs_create_request(ctx, inode, page, 0, len); ++ pgoffs = (loff_t)page->index << PAGE_CACHE_SHIFT; ++ lseg = pnfs_update_layout(inode, ctx, pgoffs, len, IOMODE_READ); ++ new = nfs_create_request(ctx, inode, page, 0, len, lseg); ++ put_lseg(lseg); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); +@@ -155,24 +162,20 @@ static void nfs_readpage_release(struct + nfs_release_request(req); + } + +-/* +- * Set up the NFS read request struct +- */ +-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset) ++int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int swap_flags = IS_SWAPFILE(inode) ? 
NFS_RPC_SWAPFLAGS : 0; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, +@@ -180,9 +183,46 @@ static int nfs_read_rpcsetup(struct nfs_ + .flags = RPC_TASK_ASYNC | swap_flags, + }; + ++ /* Set up the initial task struct. */ ++ NFS_PROTO(inode)->read_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_read); ++ ++int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) ++ return pnfs_get_read_status(data); ++ ++ return nfs_initiate_read(data, clnt, call_ops); ++} ++ ++/* ++ * Set up the NFS read request struct ++ */ ++static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + data->req = req; + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -197,21 +237,7 @@ static int nfs_read_rpcsetup(struct nfs_ + data->res.eof = 0; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops); + } + + static void +@@ -355,7 +381,14 @@ static void nfs_readpage_retry(struct rp + { + struct nfs_readargs *argp = &data->args; + struct nfs_readres *resp = &data->res; ++ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client; + ++#ifdef CONFIG_NFS_V4_1 ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + if (resp->eof || resp->count == argp->count) + return; + +@@ -369,7 +402,10 @@ static void nfs_readpage_retry(struct rp + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; +- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + } + + /* +@@ -410,13 +446,19 @@ static void nfs_readpage_release_partial + void nfs_read_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_read_data *data = calldata; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(NFS_SERVER(data->inode), ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, + &data->args.seq_args, &data->res.seq_res, + 0, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_read_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_read_partial_ops = { +@@ -569,7 +611,20 @@ 
readpage_async_filler(void *data, struct + if (len == 0) + return nfs_return_empty_page(page); + +- new = nfs_create_request(desc->ctx, inode, page, 0, len); ++ if (desc->pgio->pg_lseg) { ++ loff_t pgoff = (loff_t)page->index << PAGE_CACHE_SHIFT; ++ struct pnfs_layout_range *range = &desc->pgio->pg_lseg->pls_range; ++ ++ /* retry later with the right lseg? */ ++ if (range->offset > pgoff + len || ++ range->offset + range->length < pgoff) { ++ new = ERR_PTR(-EAGAIN); ++ goto out_error; ++ } ++ } ++ ++ new = nfs_create_request(desc->ctx, inode, page, 0, len, ++ desc->pgio->pg_lseg); + if (IS_ERR(new)) + goto out_error; + +@@ -625,7 +680,7 @@ int nfs_readpages(struct file *filp, str + if (ret == 0) + goto read_complete; /* all pages were read */ + +- pnfs_update_layout(inode, desc.ctx, IOMODE_READ); ++ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize); + if (rsize < PAGE_CACHE_SIZE) + nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); + else +@@ -634,6 +689,7 @@ int nfs_readpages(struct file *filp, str + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + + nfs_pageio_complete(&pgio); ++ put_lseg(pgio.pg_lseg); + npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); + read_complete: +diff -up linux-2.6.38.noarch/fs/nfs/super.c.orig linux-2.6.38.noarch/fs/nfs/super.c +--- linux-2.6.38.noarch/fs/nfs/super.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/super.c 2011-03-26 07:57:44.275821302 -0400 +@@ -63,6 +63,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -725,6 +726,28 @@ static int nfs_show_options(struct seq_f + + return 0; + } ++#ifdef CONFIG_NFS_V4_1 ++void show_sessions(struct seq_file *m, struct nfs_server *server) ++{ ++ if (nfs4_has_session(server->nfs_client)) ++ seq_printf(m, ",sessions"); ++} ++#else ++void show_sessions(struct 
seq_file *m, struct nfs_server *server) {} ++#endif ++ ++#ifdef CONFIG_NFS_V4_1 ++void show_pnfs(struct seq_file *m, struct nfs_server *server) ++{ ++ seq_printf(m, ",pnfs="); ++ if (server->pnfs_curr_ld) ++ seq_printf(m, "%s", server->pnfs_curr_ld->name); ++ else ++ seq_printf(m, "not configured"); ++} ++#else /* CONFIG_NFS_V4_1 */ ++void show_pnfs(struct seq_file *m, struct nfs_server *server) {} ++#endif /* CONFIG_NFS_V4_1 */ + + /* + * Present statistical information for this VFS mountpoint +@@ -763,6 +786,8 @@ static int nfs_show_stats(struct seq_fil + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); ++ show_sessions(m, nfss); ++ show_pnfs(m, nfss); + } + #endif + +diff -up linux-2.6.38.noarch/fs/nfs/unlink.c.orig linux-2.6.38.noarch/fs/nfs/unlink.c +--- linux-2.6.38.noarch/fs/nfs/unlink.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/unlink.c 2011-03-26 07:57:44.276821293 -0400 +@@ -113,7 +113,7 @@ void nfs_unlink_prepare(struct rpc_task + struct nfs_unlinkdata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->dir); + +- if (nfs4_setup_sequence(server, &data->args.seq_args, ++ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +@@ -184,19 +184,17 @@ static int nfs_do_call_unlink(struct den + return 1; + } + +-static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) ++static int nfs_call_unlink(struct dentry *parent, struct dentry *dentry, struct nfs_unlinkdata *data) + { +- struct dentry *parent; + struct inode *dir; + int ret = 0; + + +- parent = dget_parent(dentry); + if (parent == NULL) +- goto out_free; ++ goto out; + dir = parent->d_inode; + if (nfs_copy_dname(dentry, data) != 0) +- goto out_dput; ++ goto out; + /* Non-exclusive lock protects against concurrent lookup() calls */ + spin_lock(&dir->i_lock); + if 
(atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { +@@ -204,13 +202,11 @@ static int nfs_call_unlink(struct dentry + hlist_add_head(&data->list, &NFS_I(dir)->silly_list); + spin_unlock(&dir->i_lock); + ret = 1; +- goto out_dput; ++ goto out; + } + spin_unlock(&dir->i_lock); + ret = nfs_do_call_unlink(parent, dir, data); +-out_dput: +- dput(parent); +-out_free: ++out: + return ret; + } + +@@ -283,26 +279,24 @@ out: + + /** + * nfs_complete_unlink - Initialize completion of the sillydelete ++ * @parent: parent directory + * @dentry: dentry to delete +- * @inode: inode + * + * Since we're most likely to be called by dentry_iput(), we + * only use the dentry to find the sillydelete. We then copy the name + * into the qstr. + */ + void +-nfs_complete_unlink(struct dentry *dentry, struct inode *inode) ++nfs_complete_unlink(struct dentry *parent, struct dentry *dentry) + { + struct nfs_unlinkdata *data = NULL; + +- spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + data = dentry->d_fsdata; + } +- spin_unlock(&dentry->d_lock); + +- if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))) ++ if (data != NULL && !nfs_call_unlink(parent, dentry, data)) + nfs_free_unlinkdata(data); + } + +@@ -388,7 +382,7 @@ static void nfs_rename_prepare(struct rp + struct nfs_renamedata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->old_dir); + +- if (nfs4_setup_sequence(server, &data->args.seq_args, ++ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +diff -up linux-2.6.38.noarch/fs/nfs/write.c.orig linux-2.6.38.noarch/fs/nfs/write.c +--- linux-2.6.38.noarch/fs/nfs/write.c.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/fs/nfs/write.c 2011-03-26 07:57:44.277821284 -0400 +@@ -28,6 +28,7 @@ + #include "iostat.h" + #include "nfs4_fs.h" + #include "fscache.h" ++#include "pnfs.h" + + #define 
NFSDBG_FACILITY NFSDBG_PAGECACHE + +@@ -58,6 +59,7 @@ struct nfs_write_data *nfs_commitdata_al + } + return p; + } ++EXPORT_SYMBOL(nfs_commitdata_alloc); + + void nfs_commit_free(struct nfs_write_data *p) + { +@@ -426,6 +428,17 @@ static void nfs_inode_remove_request(str + spin_unlock(&inode->i_lock); + nfs_release_request(req); + } ++static void ++nfs_mark_request_nopnfs(struct nfs_page *req) ++{ ++ struct pnfs_layout_segment *lseg = req->wb_lseg; ++ ++ if (req->wb_lseg == NULL) ++ return; ++ req->wb_lseg = NULL; ++ put_lseg(lseg); ++ dprintk(" retry through MDS\n"); ++} + + static void + nfs_mark_request_dirty(struct nfs_page *req) +@@ -531,7 +544,7 @@ nfs_need_commit(struct nfs_inode *nfsi) + * The requests are *not* checked to ensure that they form a contiguous set. + */ + static int +-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) ++nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs) + { + struct nfs_inode *nfsi = NFS_I(inode); + int ret; +@@ -539,7 +552,8 @@ nfs_scan_commit(struct inode *inode, str + if (!nfs_need_commit(nfsi)) + return 0; + +- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); ++ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT, ++ use_pnfs); + if (ret > 0) + nfsi->ncommit -= ret; + if (nfs_need_commit(NFS_I(inode))) +@@ -568,7 +582,8 @@ static inline int nfs_scan_commit(struct + static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int bytes) ++ unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + unsigned int rqend; +@@ -593,8 +608,8 @@ static struct nfs_page *nfs_try_to_updat + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. 
+ */ +- if (offset > rqend +- || end < req->wb_offset) ++ if (offset > rqend || end < req->wb_offset || ++ req->wb_lseg != lseg) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) +@@ -642,16 +657,17 @@ out_err: + * already called nfs_flush_incompatible() if necessary. + */ + static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, +- struct page *page, unsigned int offset, unsigned int bytes) ++ struct page *page, unsigned int offset, unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + +- req = nfs_try_to_update_request(inode, page, offset, bytes); ++ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg); + if (req != NULL) + goto out; +- req = nfs_create_request(ctx, inode, page, offset, bytes); ++ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); +@@ -664,23 +680,27 @@ out: + } + + static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, ++ void *fsdata) + { + struct nfs_page *req; + +- req = nfs_setup_write_request(ctx, page, offset, count); ++ req = nfs_setup_write_request(ctx, page, offset, count, lseg); + if (IS_ERR(req)) + return PTR_ERR(req); + nfs_mark_request_dirty(req); + /* Update file length */ +- nfs_grow_file(page, offset, count); ++ if (pnfs_grow_ok(lseg, fsdata)) ++ nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + return 0; + } + +-int nfs_flush_incompatible(struct file *file, struct page *page) ++int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct 
nfs_page *req; +@@ -699,7 +719,8 @@ int nfs_flush_incompatible(struct file * + return 0; + do_flush = req->wb_page != page || req->wb_context != ctx || + req->wb_lock_context->lockowner != current->files || +- req->wb_lock_context->pid != current->tgid; ++ req->wb_lock_context->pid != current->tgid || ++ req->wb_lseg != lseg; + nfs_release_request(req); + if (!do_flush) + return 0; +@@ -726,7 +747,8 @@ static int nfs_write_pageuptodate(struct + * things with a page scheduled for an RPC call (e.g. invalidate it). + */ + int nfs_updatepage(struct file *file, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = page->mapping->host; +@@ -751,7 +773,7 @@ int nfs_updatepage(struct file *file, st + offset = 0; + } + +- status = nfs_writepage_setup(ctx, page, offset, count); ++ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata); + if (status < 0) + nfs_set_pageerror(page); + +@@ -781,25 +803,21 @@ static int flush_task_priority(int how) + return RPC_PRIORITY_NORMAL; + } + +-/* +- * Set up the argument/result storage required for the RPC call. 
+- */ +-static int nfs_write_rpcsetup(struct nfs_page *req, +- struct nfs_write_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset, +- int how) ++int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, +@@ -810,12 +828,62 @@ static int nfs_write_rpcsetup(struct nfs + }; + int ret = 0; + ++ /* Set up the initial task struct. */ ++ NFS_PROTO(inode)->write_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ if (how & FLUSH_SYNC) { ++ ret = rpc_wait_for_completion_task(task); ++ if (ret == 0) ++ ret = task->tk_status; ++ } ++ rpc_put_task(task); ++out: ++ return ret; ++} ++EXPORT_SYMBOL(nfs_initiate_write); ++ ++int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_write(data, clnt, call_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int nfs_write_rpcsetup(struct nfs_page *req, ++ struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset, ++ int how) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + data->req = req; + data->inode = inode = req->wb_context->path.dentry->d_inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -836,30 +904,7 @@ static int nfs_write_rpcsetup(struct nfs + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. */ +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- dprintk("NFS: %5u initiated write call " +- "(req %s/%lld, %u bytes @ offset %llu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) { +- ret = PTR_ERR(task); +- goto out; +- } +- if (how & FLUSH_SYNC) { +- ret = rpc_wait_for_completion_task(task); +- if (ret == 0) +- ret = task->tk_status; +- } +- rpc_put_task(task); +-out: +- return ret; ++ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); + } + + /* If a nfs_flush_* function fails, it should remove reqs from @head and +@@ -870,6 +915,7 @@ static void nfs_redirty_request(struct n + { + struct page *page = req->wb_page; + ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +@@ -982,6 +1028,8 @@ static void nfs_pageio_init_write(struct + { + size_t wsize = NFS_SERVER(inode)->wsize; + ++ pnfs_pageio_init_write(pgio, inode, &wsize); ++ + if (wsize < PAGE_CACHE_SIZE) + nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); + else +@@ -1047,13 +1095,27 @@ 
out: + void nfs_write_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_write_data *data = calldata; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(NFS_SERVER(data->inode), ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) { ++ /* retrying via MDS? */ ++ data->pdata.orig_count = data->args.count; ++ data->args.count = NFS_SERVER(data->inode)->wsize; ++ dprintk("%s: trimmed count %u to wsize %u\n", __func__, ++ data->pdata.orig_count, data->args.count); ++ } else ++ data->pdata.orig_count = 0; ++ ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, + &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_write_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_write_partial_ops = { +@@ -1137,10 +1199,11 @@ int nfs_writeback_done(struct rpc_task * + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *clp = server->nfs_client; + int status; + +- dprintk("NFS: %5u nfs_writeback_done (status %d)\n", +- task->tk_pid, task->tk_status); ++ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n", ++ task->tk_pid, task->tk_status, resp->count); + + /* + * ->write_done will attempt to use post-op attributes to detect +@@ -1153,6 +1216,13 @@ int nfs_writeback_done(struct rpc_task * + if (status != 0) + return status; + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); ++#ifdef CONFIG_NFS_V4_1 ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (resp->verf->committed < argp->stable && 
task->tk_status >= 0) { +@@ -1169,7 +1239,7 @@ int nfs_writeback_done(struct rpc_task * + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", +- server->nfs_client->cl_hostname, ++ clp->cl_hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } +@@ -1195,6 +1265,9 @@ int nfs_writeback_done(struct rpc_task * + */ + argp->stable = NFS_FILE_SYNC; + } ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ + nfs_restart_rpc(task, server->nfs_client); + return -EAGAIN; + } +@@ -1239,40 +1312,75 @@ static void nfs_commitdata_release(void + nfs_commit_free(wdata); + } + +-/* +- * Set up the argument/result storage required for the RPC call. +- */ +-static int nfs_commit_rpcsetup(struct list_head *head, +- struct nfs_write_data *data, +- int how) ++int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct nfs_page *first = nfs_list_entry(head->next); +- struct inode *inode = first->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = first->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, +- .callback_ops = &nfs_commit_ops, ++ .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->commit_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (how & FLUSH_SYNC) ++ rpc_wait_for_completion_task(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_commit); ++ ++ ++int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs) ++{ ++ if (pnfs && ++ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. ++ */ ++static int nfs_commit_rpcsetup(struct list_head *head, ++ struct nfs_write_data *data, ++ int how, int pnfs) ++{ ++ struct nfs_page *first = nfs_list_entry(head->next); ++ struct inode *inode = first->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + list_splice_init(head, &data->pages); + + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = first->wb_context->cred; + + data->args.fh = NFS_FH(data->inode); + /* Note: we always request a commit of the entire inode */ +@@ -1283,47 +1391,47 @@ static int nfs_commit_rpcsetup(struct li + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); ++ kref_init(&data->refcount); ++ data->parent = NULL; ++ data->args.context = first->wb_context; /* used by commit done */ + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->commit_setup(data, &msg); ++ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops, ++ how, pnfs); ++} + +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++/* Handle memory error during commit */ ++void nfs_mark_list_commit(struct list_head *head) ++{ ++ struct nfs_page *req; + +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- if (how & FLUSH_SYNC) +- rpc_wait_for_completion_task(task); +- rpc_put_task(task); +- return 0; ++ while (!list_empty(head)) { ++ req = nfs_list_entry(head->next); ++ nfs_list_remove_request(req); ++ nfs_mark_request_commit(req); ++ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); ++ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, ++ BDI_RECLAIMABLE); ++ nfs_clear_page_tag_locked(req); ++ } + } ++EXPORT_SYMBOL(nfs_mark_list_commit); + + /* + * Commit dirty pages + */ + static int +-nfs_commit_list(struct inode *inode, struct list_head *head, int how) ++nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs) + { + struct nfs_write_data *data; +- struct nfs_page *req; + + data = nfs_commitdata_alloc(); +- + if (!data) + goto out_bad; + + /* Set up the argument struct */ +- return nfs_commit_rpcsetup(head, data, how); ++ return nfs_commit_rpcsetup(head, data, how, pnfs); + out_bad: +- while (!list_empty(head)) { +- req = nfs_list_entry(head->next); +- nfs_list_remove_request(req); +- nfs_mark_request_commit(req); +- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); +- dec_bdi_stat(req->wb_page->mapping->backing_dev_info, +- BDI_RECLAIMABLE); +- nfs_clear_page_tag_locked(req); +- } ++ nfs_mark_list_commit(head); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; + } +@@ -1343,6 +1451,19 @@ static void nfs_commit_done(struct rpc_t + return; + } + ++static inline void nfs_commit_cleanup(struct kref *kref) ++{ ++ struct nfs_write_data *data; ++ ++ data = container_of(kref, struct nfs_write_data, refcount); ++ /* 
Clear lock only when all cloned commits are finished */ ++ if (data->parent) ++ kref_put(&data->parent->refcount, nfs_commit_cleanup); ++ else ++ nfs_commit_clear_lock(NFS_I(data->inode)); ++ nfs_commitdata_release(data); ++} ++ + static void nfs_commit_release(void *calldata) + { + struct nfs_write_data *data = calldata; +@@ -1360,6 +1481,11 @@ static void nfs_commit_release(void *cal + req->wb_bytes, + (long long)req_offset(req)); + if (status < 0) { ++ if (req->wb_lseg) { ++ nfs_mark_request_nopnfs(req); ++ nfs_mark_request_dirty(req); ++ goto next; ++ } + nfs_context_set_write_error(req->wb_context, status); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); +@@ -1376,12 +1502,12 @@ static void nfs_commit_release(void *cal + } + /* We have a mismatch. Write the page again */ + dprintk(" mismatch\n"); ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + next: + nfs_clear_page_tag_locked(req); + } +- nfs_commit_clear_lock(NFS_I(data->inode)); +- nfs_commitdata_release(calldata); ++ kref_put(&data->refcount, nfs_commit_cleanup); + } + + static const struct rpc_call_ops nfs_commit_ops = { +@@ -1397,21 +1523,22 @@ int nfs_commit_inode(struct inode *inode + LIST_HEAD(head); + int may_wait = how & FLUSH_SYNC; + int res = 0; ++ int use_pnfs = 0; + + if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + goto out_mark_dirty; + spin_lock(&inode->i_lock); +- res = nfs_scan_commit(inode, &head, 0, 0); ++ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs); + spin_unlock(&inode->i_lock); + if (res) { +- int error = nfs_commit_list(inode, &head, how); ++ int error = nfs_commit_list(inode, &head, how, use_pnfs); + if (error < 0) + return error; +- if (may_wait) ++ if (may_wait) { + wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); +- else ++ } else + goto out_mark_dirty; + } else + nfs_commit_clear_lock(NFS_I(inode)); +@@ -1466,7 +1593,18 @@ static int nfs_commit_unstable_pages(str + + int 
nfs_write_inode(struct inode *inode, struct writeback_control *wbc) + { +- return nfs_commit_unstable_pages(inode, wbc); ++ int ret; ++ ret = nfs_commit_unstable_pages(inode, wbc); ++ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) { ++ int err, sync = wbc->sync_mode; ++ ++ if (wbc->nonblocking || wbc->for_background) ++ sync = 0; ++ err = pnfs_layoutcommit_inode(inode, sync); ++ if (err < 0) ++ ret = err; ++ } ++ return ret; + } + + /* +diff -up linux-2.6.38.noarch/include/linux/dcache.h.orig linux-2.6.38.noarch/include/linux/dcache.h +--- linux-2.6.38.noarch/include/linux/dcache.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/dcache.h 2011-03-26 07:57:44.307821030 -0400 +@@ -169,6 +169,7 @@ struct dentry_operations { + char *(*d_dname)(struct dentry *, char *, int); + struct vfsmount *(*d_automount)(struct path *); + int (*d_manage)(struct dentry *, bool, bool); ++ void (*d_unlink)(struct dentry *, struct dentry *); + } ____cacheline_aligned; + + /* +diff -up linux-2.6.38.noarch/include/linux/exportfs.h.orig linux-2.6.38.noarch/include/linux/exportfs.h +--- linux-2.6.38.noarch/include/linux/exportfs.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/exportfs.h 2011-03-26 07:57:44.309821012 -0400 +@@ -2,6 +2,7 @@ + #define LINUX_EXPORTFS_H 1 + + #include ++#include + + struct dentry; + struct inode; +@@ -188,4 +189,62 @@ extern struct dentry *generic_fh_to_pare + struct fid *fid, int fh_len, int fh_type, + struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); + ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct pnfs_filelayout_device; ++struct pnfs_filelayout_layout; ++ ++extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev); ++extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if 
defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct list_head; ++ ++extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes); ++ ++extern enum nfsstat4 blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *layouts); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_PNFSD) ++#include ++ ++struct pnfsd_cb_operations; ++ ++struct pnfsd_cb_ctl { ++ spinlock_t lock; ++ struct module *module; ++ const struct pnfsd_cb_operations *cb_op; ++}; ++ ++/* in expfs.c so that file systems can depend on it */ ++extern struct pnfsd_cb_ctl pnfsd_cb_ctl; ++ ++static inline int ++pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ int ret = -ENOENT; ++ ++ spin_lock(&pnfsd_cb_ctl.lock); ++ if (!pnfsd_cb_ctl.cb_op) ++ goto out; ++ if (!try_module_get(pnfsd_cb_ctl.module)) ++ goto out; ++ ctl->cb_op = pnfsd_cb_ctl.cb_op; ++ ctl->module = pnfsd_cb_ctl.module; ++ ret = 0; ++out: ++ spin_unlock(&pnfsd_cb_ctl.lock); ++ return ret; ++} ++ ++static inline void ++pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ module_put(ctl->module); ++} ++#endif /* CONFIG_PNFSD */ + #endif /* LINUX_EXPORTFS_H */ +diff -up linux-2.6.38.noarch/include/linux/exp_xdr.h.orig linux-2.6.38.noarch/include/linux/exp_xdr.h +--- linux-2.6.38.noarch/include/linux/exp_xdr.h.orig 2011-03-26 07:57:44.308821021 -0400 ++++ linux-2.6.38.noarch/include/linux/exp_xdr.h 2011-03-26 07:57:44.308821021 -0400 +@@ -0,0 +1,141 @@ ++#ifndef _LINUX_EXP_XDR_H ++#define _LINUX_EXP_XDR_H ++ ++#include ++#include ++#include ++ ++struct exp_xdr_stream { ++ __be32 *p; ++ __be32 *end; ++}; ++ ++/** ++ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline size_t ++exp_xdr_qwords(__u32 nbytes) ++{ ++ return DIV_ROUND_UP(nbytes, 4); ++} ++ ++/** ++ * exp_xdr_qbytes - Calculate the number of bytes holding qwords ++ * @qwords: number of quad-words to encode ++ */ ++static inline size_t 
++exp_xdr_qbytes(size_t qwords) ++{ ++ return qwords << 2; ++} ++ ++/** ++ * exp_xdr_reserve_space - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @nbytes: number of bytes to reserve ++ * ++ * Checks that we have enough buffer space to encode 'nbytes' more ++ * bytes of data. If so, update the xdr stream. ++ */ ++static inline __be32 * ++exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes) ++{ ++ __be32 *p = xdr->p; ++ __be32 *q; ++ ++ /* align nbytes on the next 32-bit boundary */ ++ q = p + exp_xdr_qwords(nbytes); ++ if (unlikely(q > xdr->end || q < p)) ++ return NULL; ++ xdr->p = q; ++ return p; ++} ++ ++/** ++ * exp_xdr_reserve_qwords - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @nwords: number of quad words (u32's) to reserve ++ */ ++static inline __be32 * ++exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords) ++{ ++ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords)); ++} ++ ++/** ++ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u32(__be32 *p, __u32 val) ++{ ++ *p = cpu_to_be32(val); ++ return p + 1; ++} ++ ++/** ++ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u64(__be32 *p, __u64 val) ++{ ++ put_unaligned_be64(val, p); ++ return p + 2; ++} ++ ++/** ++ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the array of bytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ if (likely(nbytes != 0)) { ++ unsigned int qwords = exp_xdr_qwords(nbytes); ++ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes; ++ ++ memcpy(p, 
ptr, nbytes); ++ if (padding != 0) ++ memset((char *)p + nbytes, 0, padding); ++ p += qwords; ++ } ++ return p; ++} ++ ++/** ++ * exp_xdr_encode_opaque - Encode an opaque type onto a xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the opaque array ++ * @nbytes: number of bytes to encode ++ * ++ * Encodes the 32-bit opaque size in bytes followed by the opaque value. ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ p = exp_xdr_encode_u32(p, nbytes); ++ return exp_xdr_encode_bytes(p, ptr, nbytes); ++} ++ ++/** ++ * exp_xdr_encode_opaque_qlen - Encode the opaque length onto a xdr stream ++ * @lenp: pointer to the opaque length destination ++ * @endp: pointer to the end of the opaque array ++ * ++ * Encodes the 32-bit opaque size in bytes given the start and end pointers ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp) ++{ ++ size_t nbytes = (char *)endp - (char *)(lenp + 1); ++ ++ exp_xdr_encode_u32(lenp, nbytes); ++ return lenp + 1 + exp_xdr_qwords(nbytes); ++} ++#endif /* _LINUX_EXP_XDR_H */ +diff -up linux-2.6.38.noarch/include/linux/fs.h.orig linux-2.6.38.noarch/include/linux/fs.h +--- linux-2.6.38.noarch/include/linux/fs.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/fs.h 2011-03-26 07:57:44.311820996 -0400 +@@ -399,6 +399,7 @@ struct inodes_stat_t { + #include + + struct export_operations; ++struct pnfs_export_operations; + struct hd_geometry; + struct iovec; + struct nameidata; +@@ -1368,6 +1369,7 @@ struct super_block { + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; ++ const struct pnfs_export_operations *s_pnfs_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; +diff -up linux-2.6.38.noarch/include/linux/nfs4.h.orig linux-2.6.38.noarch/include/linux/nfs4.h +--- 
linux-2.6.38.noarch/include/linux/nfs4.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/nfs4.h 2011-03-26 07:57:44.313820980 -0400 +@@ -17,6 +17,7 @@ + + #define NFS4_BITMAP_SIZE 2 + #define NFS4_VERIFIER_SIZE 8 ++#define NFS4_CLIENTID_SIZE 8 + #define NFS4_STATEID_SEQID_SIZE 4 + #define NFS4_STATEID_OTHER_SIZE 12 + #define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE) +@@ -131,6 +132,13 @@ + #define EXCHGID4_FLAG_MASK_A 0x40070103 + #define EXCHGID4_FLAG_MASK_R 0x80070103 + ++static inline bool ++is_ds_only_session(u32 exchange_flags) ++{ ++ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS; ++ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS; ++} ++ + #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004 +@@ -181,7 +189,13 @@ struct nfs4_acl { + struct nfs4_ace aces[0]; + }; + ++struct nfs4_fsid { ++ u64 major; ++ u64 minor; ++}; ++ + typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; ++typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid; + + struct nfs41_stateid { + __be32 seqid; +@@ -559,7 +573,12 @@ enum { + NFSPROC4_CLNT_GET_LEASE_TIME, + NFSPROC4_CLNT_RECLAIM_COMPLETE, + NFSPROC4_CLNT_LAYOUTGET, ++ NFSPROC4_CLNT_LAYOUTCOMMIT, ++ NFSPROC4_CLNT_LAYOUTRETURN, ++ NFSPROC4_CLNT_GETDEVICELIST, + NFSPROC4_CLNT_GETDEVICEINFO, ++ NFSPROC4_CLNT_PNFS_WRITE, ++ NFSPROC4_CLNT_PNFS_COMMIT, + }; + + /* nfs41 types */ +@@ -582,6 +601,8 @@ enum pnfs_layouttype { + LAYOUT_NFSV4_1_FILES = 1, + LAYOUT_OSD2_OBJECTS = 2, + LAYOUT_BLOCK_VOLUME = 3, ++ ++ NFS4_PNFS_PRIVATE_LAYOUT = 0x80000000 + }; + + /* used for both layout return and recall */ +diff -up linux-2.6.38.noarch/include/linux/nfsd4_block.h.orig linux-2.6.38.noarch/include/linux/nfsd4_block.h +--- linux-2.6.38.noarch/include/linux/nfsd4_block.h.orig 2011-03-26 07:57:44.326820870 -0400 ++++ 
linux-2.6.38.noarch/include/linux/nfsd4_block.h 2011-03-26 07:57:44.326820870 -0400 +@@ -0,0 +1,101 @@ ++#ifndef NFSD4_BLOCK ++#define NFSD4_BLOCK ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_BLOCK_SUCCESS 1 ++#define PNFS_BLOCK_FAILURE 0 ++ ++#define PNFS_BLOCK_CTL_START 1 ++#define PNFS_BLOCK_CTL_STOP 2 ++#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current ++ * version from kernel via an upcall. ++ */ ++ ++#define PNFS_UPCALL_MSG_STOP 0 ++#define PNFS_UPCALL_MSG_GETSIG 1 ++#define PNFS_UPCALL_MSG_GETSLICE 2 ++#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume ++#define PNFS_UPCALL_MSG_DMGET 4 ++#define PNFS_UPCALL_MSG_VERS 5 ++ ++#define PNFS_UPCALL_VERS 8 ++ ++typedef struct stripe_dev { ++ int major, ++ minor, ++ offset; ++} stripe_dev_t; ++ ++typedef struct bl_comm_res { ++ int res_status; ++ union { ++ struct { ++ long long start, ++ length; ++ } slice; ++ struct { ++ int num_stripes, ++ stripe_size; ++ stripe_dev_t devs[]; ++ } stripe; ++ struct { ++ long long sector; ++ int offset, ++ len; ++ char sig[]; ++ } sig; ++ int vers, ++ dm_vol; ++ } u; ++} bl_comm_res_t; ++ ++typedef struct bl_comm_msg { ++ int msg_type, ++ msg_status; ++ union { ++ dev_t msg_dev; ++ int msg_vers; ++ } u; ++ bl_comm_res_t *msg_res; ++} bl_comm_msg_t; ++ ++#ifdef __KERNEL__ ++ ++typedef struct bl_comm { ++ /* ---- protects access to this structure ---- */ ++ struct mutex lock; ++ /* ---- protects access to rpc pipe ---- */ ++ struct mutex pipe_lock; ++ struct dentry *pipe_dentry; ++ wait_queue_head_t pipe_wq; ++ bl_comm_msg_t msg; ++} bl_comm_t; ++ ++int pnfs_block_enabled(struct inode *, int); ++int bl_layout_type(struct super_block *sb); ++int bl_getdeviceiter(struct super_block *, u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *, ++ 
const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int bl_layoutcommit(struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++int bl_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len); ++int bl_init_proc(void); ++int bl_upcall(bl_comm_t *, bl_comm_msg_t *, bl_comm_res_t **); ++ ++extern bl_comm_t *bl_comm_global; // Ugly... ++#endif /* __KERNEL__ */ ++ ++#endif /* NFSD4_BLOCK */ ++ +diff -up linux-2.6.38.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.38.noarch/include/linux/nfsd4_spnfs.h +--- linux-2.6.38.noarch/include/linux/nfsd4_spnfs.h.orig 2011-03-26 07:57:44.328820852 -0400 ++++ linux-2.6.38.noarch/include/linux/nfsd4_spnfs.h 2011-03-26 07:57:44.328820852 -0400 +@@ -0,0 +1,345 @@ ++/* ++ * include/linux/nfsd4_spnfs.h ++ * ++ * spNFS - simple pNFS implementation with userspace daemon ++ * ++ */ ++ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++******************************************************************************/ ++ ++#ifndef NFS_SPNFS_H ++#define NFS_SPNFS_H ++ ++ ++#ifdef __KERNEL__ ++#include "exportfs.h" ++#include "sunrpc/svc.h" ++#include "nfsd/nfsfh.h" ++#else ++#include ++#endif /* __KERNEL__ */ ++ ++#define SPNFS_STATUS_INVALIDMSG 0x01 ++#define SPNFS_STATUS_AGAIN 0x02 ++#define SPNFS_STATUS_FAIL 0x04 ++#define SPNFS_STATUS_SUCCESS 0x08 ++ ++#define SPNFS_TYPE_LAYOUTGET 0x01 ++#define SPNFS_TYPE_LAYOUTCOMMIT 0x02 ++#define SPNFS_TYPE_LAYOUTRETURN 0x03 ++#define SPNFS_TYPE_GETDEVICEITER 0x04 ++#define SPNFS_TYPE_GETDEVICEINFO 0x05 ++#define SPNFS_TYPE_SETATTR 0x06 ++#define SPNFS_TYPE_OPEN 0x07 ++#define SPNFS_TYPE_CLOSE 0x08 ++#define SPNFS_TYPE_CREATE 0x09 ++#define SPNFS_TYPE_REMOVE 0x0a ++#define SPNFS_TYPE_COMMIT 0x0b ++#define SPNFS_TYPE_READ 0x0c ++#define SPNFS_TYPE_WRITE 0x0d ++ ++#define SPNFS_MAX_DEVICES 1 ++#define SPNFS_MAX_DATA_SERVERS 16 ++#define SPNFS_MAX_IO 512 ++ ++/* layout */ ++struct spnfs_msg_layoutget_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_filelayout_list { ++ u_int32_t fh_len; ++ unsigned char fh_val[128]; /* DMXXX fix this const */ ++}; ++ ++struct spnfs_msg_layoutget_res { ++ int status; ++ u_int64_t devid; ++ u_int64_t stripe_size; ++ u_int32_t stripe_type; ++ u_int32_t stripe_count; ++ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++/* layoutcommit */ ++struct 
spnfs_msg_layoutcommit_args { ++ unsigned long inode; ++ unsigned long generation; ++ u_int64_t file_size; ++}; ++ ++struct spnfs_msg_layoutcommit_res { ++ int status; ++}; ++ ++/* layoutreturn */ ++/* No op for the daemon */ ++/* ++struct spnfs_msg_layoutreturn_args { ++}; ++ ++struct spnfs_msg_layoutreturn_res { ++}; ++*/ ++ ++/* getdeviceiter */ ++struct spnfs_msg_getdeviceiter_args { ++ unsigned long inode; ++ u_int64_t cookie; ++ u_int64_t verf; ++}; ++ ++struct spnfs_msg_getdeviceiter_res { ++ int status; ++ u_int64_t devid; ++ u_int64_t cookie; ++ u_int64_t verf; ++ u_int32_t eof; ++}; ++ ++/* getdeviceinfo */ ++struct spnfs_data_server { ++ u_int32_t dsid; ++ char netid[5]; ++ char addr[29]; ++}; ++ ++struct spnfs_device { ++ u_int64_t devid; ++ int dscount; ++ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++struct spnfs_msg_getdeviceinfo_args { ++ u_int64_t devid; ++}; ++ ++struct spnfs_msg_getdeviceinfo_res { ++ int status; ++ struct spnfs_device devinfo; ++}; ++ ++/* setattr */ ++struct spnfs_msg_setattr_args { ++ unsigned long inode; ++ unsigned long generation; ++ int file_size; ++}; ++ ++struct spnfs_msg_setattr_res { ++ int status; ++}; ++ ++/* open */ ++struct spnfs_msg_open_args { ++ unsigned long inode; ++ unsigned long generation; ++ int create; ++ int createmode; ++ int truncate; ++}; ++ ++struct spnfs_msg_open_res { ++ int status; ++}; ++ ++/* close */ ++/* No op for daemon */ ++struct spnfs_msg_close_args { ++ int x; ++}; ++ ++struct spnfs_msg_close_res { ++ int y; ++}; ++ ++/* create */ ++/* ++struct spnfs_msg_create_args { ++ int x; ++}; ++ ++struct spnfs_msg_create_res { ++ int y; ++}; ++*/ ++ ++/* remove */ ++struct spnfs_msg_remove_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_msg_remove_res { ++ int status; ++}; ++ ++/* commit */ ++/* ++struct spnfs_msg_commit_args { ++ int x; ++}; ++ ++struct spnfs_msg_commit_res { ++ int y; ++}; ++*/ ++ ++/* read */ ++struct spnfs_msg_read_args { 
++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++}; ++ ++struct spnfs_msg_read_res { ++ int status; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++/* write */ ++struct spnfs_msg_write_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++struct spnfs_msg_write_res { ++ int status; ++}; ++ ++/* bundle args and responses */ ++union spnfs_msg_args { ++ struct spnfs_msg_layoutget_args layoutget_args; ++ struct spnfs_msg_layoutcommit_args layoutcommit_args; ++/* ++ struct spnfs_msg_layoutreturn_args layoutreturn_args; ++*/ ++ struct spnfs_msg_getdeviceiter_args getdeviceiter_args; ++ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args; ++ struct spnfs_msg_setattr_args setattr_args; ++ struct spnfs_msg_open_args open_args; ++ struct spnfs_msg_close_args close_args; ++/* ++ struct spnfs_msg_create_args create_args; ++*/ ++ struct spnfs_msg_remove_args remove_args; ++/* ++ struct spnfs_msg_commit_args commit_args; ++*/ ++ struct spnfs_msg_read_args read_args; ++ struct spnfs_msg_write_args write_args; ++}; ++ ++union spnfs_msg_res { ++ struct spnfs_msg_layoutget_res layoutget_res; ++ struct spnfs_msg_layoutcommit_res layoutcommit_res; ++/* ++ struct spnfs_msg_layoutreturn_res layoutreturn_res; ++*/ ++ struct spnfs_msg_getdeviceiter_res getdeviceiter_res; ++ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res; ++ struct spnfs_msg_setattr_res setattr_res; ++ struct spnfs_msg_open_res open_res; ++ struct spnfs_msg_close_res close_res; ++/* ++ struct spnfs_msg_create_res create_res; ++*/ ++ struct spnfs_msg_remove_res remove_res; ++/* ++ struct spnfs_msg_commit_res commit_res; ++*/ ++ struct spnfs_msg_read_res read_res; ++ struct spnfs_msg_write_res write_res; ++}; ++ ++/* a spnfs message, args and response */ ++struct spnfs_msg { ++ unsigned char im_type; ++ unsigned char im_status; ++ union spnfs_msg_args im_args; ++ union spnfs_msg_res im_res; ++}; ++ 
++/* spnfs configuration info */ ++struct spnfs_config { ++ unsigned char dense_striping; ++ int stripe_size; ++ int num_ds; ++ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */ ++}; ++ ++#if defined(__KERNEL__) && defined(CONFIG_SPNFS) ++ ++#include ++ ++/* pipe mgmt structure. messages flow through here */ ++struct spnfs { ++ struct dentry *spnfs_dentry; /* dentry for pipe */ ++ wait_queue_head_t spnfs_wq; ++ struct spnfs_msg spnfs_im; /* spnfs message */ ++ struct mutex spnfs_lock; /* Serializes upcalls */ ++ struct mutex spnfs_plock; ++}; ++ ++struct nfsd4_open; ++ ++int spnfs_layout_type(struct super_block *); ++enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int spnfs_layoutcommit(void); ++int spnfs_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int spnfs_getdeviceiter(struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++int spnfs_setattr(void); ++int spnfs_open(struct inode *, struct nfsd4_open *); ++int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *); ++int spnfs_remove(unsigned long, unsigned long); ++__be32 spnfs_read(struct inode *, loff_t, unsigned long *, ++ int, struct svc_rqst *); ++__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *); ++int spnfs_getfh(int, struct nfs_fh *); ++int spnfs_test_layoutrecall(char *, u64, u64); ++int spnfs_layoutrecall(struct inode *, int, u64, u64); ++ ++int nfsd_spnfs_new(void); ++void nfsd_spnfs_delete(void); ++int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *); ++int spnfs_enabled(void); ++int spnfs_init_proc(void); ++ ++extern struct spnfs_config *spnfs_config; ++ ++#endif /* __KERNEL__ && CONFIG_SPNFS */ ++ ++#endif /* NFS_SPNFS_H */ +diff 
-up linux-2.6.38.noarch/include/linux/nfsd/const.h.orig linux-2.6.38.noarch/include/linux/nfsd/const.h +--- linux-2.6.38.noarch/include/linux/nfsd/const.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/nfsd/const.h 2011-03-26 07:57:44.321820912 -0400 +@@ -29,6 +29,7 @@ + #ifdef __KERNEL__ + + #include ++#include + + /* + * Largest number of bytes we need to allocate for an NFS +diff -up linux-2.6.38.noarch/include/linux/nfsd/debug.h.orig linux-2.6.38.noarch/include/linux/nfsd/debug.h +--- linux-2.6.38.noarch/include/linux/nfsd/debug.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/nfsd/debug.h 2011-03-26 07:57:44.322820904 -0400 +@@ -32,6 +32,8 @@ + #define NFSDDBG_REPCACHE 0x0080 + #define NFSDDBG_XDR 0x0100 + #define NFSDDBG_LOCKD 0x0200 ++#define NFSDDBG_PNFS 0x0400 ++#define NFSDDBG_FILELAYOUT 0x0800 + #define NFSDDBG_ALL 0x7FFF + #define NFSDDBG_NOCHANGE 0xFFFF + +diff -up linux-2.6.38.noarch/include/linux/nfsd/export.h.orig linux-2.6.38.noarch/include/linux/nfsd/export.h +--- linux-2.6.38.noarch/include/linux/nfsd/export.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/nfsd/export.h 2011-03-26 07:57:44.322820904 -0400 +@@ -79,6 +79,20 @@ struct nfsd4_fs_locations { + }; + + /* ++ * Callbacks ++ */ ++struct nfsd4_callback { ++ void *cb_op; ++ struct nfs4_client *cb_clp; ++ struct list_head cb_per_client; ++ u32 cb_minorversion; ++ struct rpc_message cb_msg; ++ const struct rpc_call_ops *cb_ops; ++ struct work_struct cb_work; ++ bool cb_done; ++}; ++ ++/* + * We keep an array of pseudoflavors with the export, in order from most + * to least preferred. 
For the forseeable future, we don't expect more + * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3, +@@ -100,6 +114,7 @@ struct svc_export { + uid_t ex_anon_uid; + gid_t ex_anon_gid; + int ex_fsid; ++ int ex_pnfs; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + int ex_nflavors; +diff -up linux-2.6.38.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.38.noarch/include/linux/nfsd/nfs4layoutxdr.h +--- linux-2.6.38.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2011-03-26 07:57:44.323820896 -0400 ++++ linux-2.6.38.noarch/include/linux/nfsd/nfs4layoutxdr.h 2011-03-26 07:57:44.323820896 -0400 +@@ -0,0 +1,132 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#ifndef NFSD_NFS4LAYOUTXDR_H ++#define NFSD_NFS4LAYOUTXDR_H ++ ++#include ++#include ++ ++/* the nfsd4_pnfs_devlist dev_addr for the file layout type */ ++struct pnfs_filelayout_devaddr { ++ struct xdr_netobj r_netid; ++ struct xdr_netobj r_addr; ++}; ++ ++/* list of multipath servers */ ++struct pnfs_filelayout_multipath { ++ u32 fl_multipath_length; ++ struct pnfs_filelayout_devaddr *fl_multipath_list; ++}; ++ ++struct pnfs_filelayout_device { ++ u32 fl_stripeindices_length; ++ u32 *fl_stripeindices_list; ++ u32 fl_device_length; ++ struct pnfs_filelayout_multipath *fl_device_list; ++}; ++ ++struct pnfs_filelayout_layout { ++ u32 lg_layout_type; /* response */ ++ u32 lg_stripe_type; /* response */ ++ u32 lg_commit_through_mds; /* response */ ++ u64 lg_stripe_unit; /* response */ ++ u64 lg_pattern_offset; /* response */ ++ u32 lg_first_stripe_index; /* response */ ++ struct nfsd4_pnfs_deviceid device_id; /* response */ ++ u32 lg_fh_length; /* response */ ++ struct knfsd_fh *lg_fh_list; /* response */ ++}; ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++enum pnfs_block_extent_state4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, ++ PNFS_BLOCK_NONE_DATA = 3 ++}; ++ ++enum pnfs_block_volume_type4 { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, ++ PNFS_BLOCK_VOLUME_SLICE = 1, ++ PNFS_BLOCK_VOLUME_CONCAT = 2, ++ PNFS_BLOCK_VOLUME_STRIPE = 3, ++}; ++typedef 
enum pnfs_block_volume_type4 pnfs_block_volume_type4; ++ ++enum bl_cache_state { ++ BLOCK_LAYOUT_NEW = 0, ++ BLOCK_LAYOUT_CACHE = 1, ++ BLOCK_LAYOUT_UPDATE = 2, ++}; ++ ++typedef struct pnfs_blocklayout_layout { ++ struct list_head bll_list; ++ struct nfsd4_pnfs_deviceid bll_vol_id; ++ u64 bll_foff; // file offset ++ u64 bll_len; ++ u64 bll_soff; // storage offset ++ int bll_recalled; ++ enum pnfs_block_extent_state4 bll_es; ++ enum bl_cache_state bll_cache_state; ++} pnfs_blocklayout_layout_t; ++ ++typedef struct pnfs_blocklayout_devinfo { ++ struct list_head bld_list; ++ pnfs_block_volume_type4 bld_type; ++ struct nfsd4_pnfs_deviceid bld_devid; ++ int bld_index_loc; ++ union { ++ struct { ++ u64 bld_offset; ++ u32 bld_sig_len, ++ *bld_sig; ++ } simple; ++ struct { ++ u64 bld_start, ++ bld_len; ++ u32 bld_index; /* Index of Simple Volume */ ++ } slice; ++ struct { ++ u32 bld_stripes; ++ u64 bld_chunk_size; ++ u32 *bld_stripe_indexs; ++ } stripe; ++ } u; ++} pnfs_blocklayout_devinfo_t; ++ ++#endif /* NFSD_NFS4LAYOUTXDR_H */ +diff -up linux-2.6.38.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.38.noarch/include/linux/nfsd/nfs4pnfsdlm.h +--- linux-2.6.38.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2011-03-26 07:57:44.323820896 -0400 ++++ linux-2.6.38.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2011-03-26 07:57:44.323820896 -0400 +@@ -0,0 +1,54 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++#include ++ ++/* ++ * Length of comma separated pnfs data server IPv4 addresses. Enough room for ++ * 32 addresses. ++ */ ++#define NFSD_DLM_DS_LIST_MAX 512 ++/* ++ * Length of colon separated pnfs dlm device of the form ++ * disk_name:comma separated data server IPv4 address ++ */ ++#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1) ++ ++#ifdef CONFIG_PNFSD ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++extern const struct pnfs_export_operations pnfs_dlm_export_ops; ++ ++int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len); ++ ++void nfsd4_pnfs_dlm_shutdown(void); ++ ++ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen); ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.38.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.38.noarch/include/linux/nfsd/nfsd4_pnfs.h +--- linux-2.6.38.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2011-03-26 07:57:44.324820888 -0400 ++++ linux-2.6.38.noarch/include/linux/nfsd/nfsd4_pnfs.h 2011-03-26 07:57:44.324820888 -0400 +@@ -0,0 +1,273 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef _LINUX_NFSD_NFSD4_PNFS_H ++#define _LINUX_NFSD_NFSD4_PNFS_H ++ ++#include ++#include ++#include ++#include ++ ++struct nfsd4_pnfs_deviceid { ++ u64 sbid; /* per-superblock unique ID */ ++ u64 devid; /* filesystem-wide unique device ID */ ++}; ++ ++struct nfsd4_pnfs_dev_iter_res { ++ u64 gd_cookie; /* request/response */ ++ u64 gd_verf; /* request/response */ ++ u64 gd_devid; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++/* Arguments for set_device_notify */ ++struct pnfs_devnotify_arg { ++ struct nfsd4_pnfs_deviceid dn_devid; /* request */ ++ u32 dn_layout_type; /* request */ ++ u32 dn_notify_types; /* request/response */ ++}; ++ ++struct nfsd4_layout_seg { ++ u64 clientid; ++ u32 layout_type; ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++/* Used by layout_get to encode layout (loc_body var in spec) ++ * Args: ++ * minlength - min number of accessible bytes given by layout ++ * fsid - Major part of struct pnfs_deviceid. File system uses this ++ * to build the deviceid returned in the layout. 
++ * fh - fs can modify the file handle for use on data servers ++ * seg - layout info requested and layout info returned ++ * xdr - xdr info ++ * return_on_close - true if layout to be returned on file close ++ */ ++ ++struct nfsd4_pnfs_layoutget_arg { ++ u64 lg_minlength; ++ u64 lg_sbid; ++ const struct knfsd_fh *lg_fh; ++}; ++ ++struct nfsd4_pnfs_layoutget_res { ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_return_on_close; ++}; ++ ++struct nfsd4_pnfs_layoutcommit_arg { ++ struct nfsd4_layout_seg lc_seg; /* request */ ++ u32 lc_reclaim; /* request */ ++ u32 lc_newoffset; /* request */ ++ u64 lc_last_wr; /* request */ ++ struct nfstime4 lc_mtime; /* request */ ++ u32 lc_up_len; /* layout length */ ++ void *lc_up_layout; /* decoded by callback */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit_res { ++ u32 lc_size_chg; /* boolean for response */ ++ u64 lc_newsize; /* response */ ++}; ++ ++#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */ ++ ++struct nfsd4_pnfs_layoutreturn_arg { ++ u32 lr_return_type; /* request */ ++ struct nfsd4_layout_seg lr_seg; /* request */ ++ u32 lr_reclaim; /* request */ ++ u32 lrf_body_len; /* request */ ++ void *lrf_body; /* request */ ++ void *lr_cookie; /* fs private */ ++}; ++ ++/* pNFS Metadata to Data server state communication */ ++struct pnfs_get_state { ++ u32 dsid; /* request */ ++ u64 ino; /* request */ ++ nfs4_stateid stid; /* request;response */ ++ nfs4_clientid clid; /* response */ ++ u32 access; /* response */ ++ u32 stid_gen; /* response */ ++ u32 verifier[2]; /* response */ ++}; ++ ++/* ++ * pNFS export operations vector. ++ * ++ * The filesystem must implement the following methods: ++ * layout_type ++ * get_device_info ++ * layout_get ++ * ++ * All other methods are optional and can be set to NULL if not implemented. ++ */ ++struct pnfs_export_operations { ++ /* Returns the supported pnfs_layouttype4. 
*/ ++ int (*layout_type) (struct super_block *); ++ ++ /* Encode device info onto the xdr stream. */ ++ int (*get_device_info) (struct super_block *, ++ struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++ ++ /* Retrieve all available devices via an iterator. ++ * arg->cookie == 0 indicates the beginning of the list, ++ * otherwise arg->verf is used to verify that the list hasn't changed ++ * while retrieved. ++ * ++ * On output, the filesystem sets the devid based on the current cookie ++ * and sets res->cookie and res->verf corresponding to the next entry. ++ * When the last entry in the list is retrieved, res->eof is set to 1. ++ */ ++ int (*get_device_iter) (struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++ ++ int (*set_device_notify) (struct super_block *, ++ struct pnfs_devnotify_arg *); ++ ++ /* Retrieve and encode a layout for inode onto the xdr stream. ++ * arg->minlength is the minimum number of accessible bytes required ++ * by the client. ++ * The maximum number of bytes to encode the layout is given by ++ * the xdr stream end pointer. ++ * arg->fsid contains the major part of struct pnfs_deviceid. ++ * The file system uses this to build the deviceid returned ++ * in the layout. ++ * res->seg - layout segment requested and layout info returned. 
++ * res->fh can be modified the file handle for use on data servers ++ * res->return_on_close - true if layout to be returned on file close ++ * ++ * return one of the following nfs errors: ++ * NFS_OK Success ++ * NFS4ERR_ACCESS Permission error ++ * NFS4ERR_BADIOMODE Server does not support requested iomode ++ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules ++ * NFS4ERR_INVAL Parameter other than layout is invalid ++ * NFS4ERR_IO I/O error ++ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later ++ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file ++ * NFS4ERR_LOCKED Lock conflict ++ * NFS4ERR_NOSPC Out-of-space error occurred ++ * NFS4ERR_RECALLCONFLICT Layout currently unavailable due to ++ * a conflicting CB_LAYOUTRECALL ++ * NFS4ERR_SERVERFAULT Server went berserk ++ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout ++ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file) ++ */ ++ enum nfsstat4 (*layout_get) (struct inode *, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++ ++ /* Commit changes to layout */ ++ int (*layout_commit) (struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++ ++ /* Returns the layout */ ++ int (*layout_return) (struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++ ++ /* Can layout segments be merged for this layout type? 
*/ ++ int (*can_merge_layouts) (u32 layout_type); ++ ++ /* pNFS Files layout specific operations */ ++ ++ /* Get the write verifier for DS (called on MDS only) */ ++ void (*get_verifier) (struct super_block *, u32 *p); ++ /* Call fs on DS only */ ++ int (*get_state) (struct inode *, struct knfsd_fh *, ++ struct pnfs_get_state *); ++}; ++ ++struct nfsd4_pnfs_cb_layout { ++ u32 cbl_recall_type; /* request */ ++ struct nfsd4_layout_seg cbl_seg; /* request */ ++ u32 cbl_layoutchanged; /* request */ ++ nfs4_stateid cbl_sid; /* request */ ++ struct nfs4_fsid cbl_fsid; ++ void *cbl_cookie; /* fs private */ ++}; ++ ++/* layoutrecall request (from exported filesystem) */ ++struct nfs4_layoutrecall { ++ struct kref clr_ref; ++ struct nfsd4_pnfs_cb_layout cb; /* request */ ++ struct list_head clr_perclnt; /* on cl_layoutrecalls */ ++ struct nfs4_client *clr_client; ++ struct nfs4_file *clr_file; ++ struct timespec clr_time; /* last activity */ ++ struct super_block *clr_sb; /* We might not have a file */ ++ struct nfs4_layoutrecall *parent; /* The initiating recall */ ++ ++ /* nfsd internal */ ++ struct nfsd4_callback clr_recall; ++}; ++ ++struct nfsd4_pnfs_cb_dev_item { ++ u32 cbd_notify_type; /* request */ ++ u32 cbd_layout_type; /* request */ ++ struct nfsd4_pnfs_deviceid cbd_devid; /* request */ ++ u32 cbd_immediate; /* request */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_list { ++ u32 cbd_len; /* request */ ++ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */ ++}; ++ ++/* ++ * callbacks provided by the nfsd ++ */ ++struct pnfsd_cb_operations { ++ /* Generic callbacks */ ++ int (*cb_layout_recall) (struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++ int (*cb_device_notify) (struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++ ++ /* pNFS Files layout specific callbacks */ ++ ++ /* Callback from fs on MDS only */ ++ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *); ++ /* Callback from fs on DS only */ ++ int 
(*cb_change_state) (struct pnfs_get_state *); ++}; ++ ++#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ +diff -up linux-2.6.38.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.38.noarch/include/linux/nfsd/syscall.h +--- linux-2.6.38.noarch/include/linux/nfsd/syscall.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/nfsd/syscall.h 2011-03-26 07:57:44.325820879 -0400 +@@ -29,6 +29,7 @@ + /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ + #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ + #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ ++#define NFSCTL_FD2FH 9 /* get a fh from a fd */ + + /* SVC */ + struct nfsctl_svc { +@@ -71,6 +72,11 @@ struct nfsctl_fsparm { + int gd_maxlen; + }; + ++/* FD2FH */ ++struct nfsctl_fd2fh { ++ int fd; ++}; ++ + /* + * This is the argument union. + */ +@@ -82,6 +88,7 @@ struct nfsctl_arg { + struct nfsctl_export u_export; + struct nfsctl_fdparm u_getfd; + struct nfsctl_fsparm u_getfs; ++ struct nfsctl_fd2fh u_fd2fh; + /* + * The following dummy member is needed to preserve binary compatibility + * on platforms where alignof(void*)>alignof(int). 
It's needed because +@@ -95,6 +102,7 @@ struct nfsctl_arg { + #define ca_export u.u_export + #define ca_getfd u.u_getfd + #define ca_getfs u.u_getfs ++#define ca_fd2fh u.u_fd2fh + }; + + union nfsctl_res { +diff -up linux-2.6.38.noarch/include/linux/nfs_fs.h.orig linux-2.6.38.noarch/include/linux/nfs_fs.h +--- linux-2.6.38.noarch/include/linux/nfs_fs.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/nfs_fs.h 2011-03-26 07:57:44.315820964 -0400 +@@ -488,7 +488,7 @@ extern void nfs_release_automount_timer( + /* + * linux/fs/nfs/unlink.c + */ +-extern void nfs_complete_unlink(struct dentry *dentry, struct inode *); ++extern void nfs_complete_unlink(struct dentry *dentry, struct dentry *); + extern void nfs_block_sillyrename(struct dentry *dentry); + extern void nfs_unblock_sillyrename(struct dentry *dentry); + extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); +@@ -499,8 +499,12 @@ extern int nfs_sillyrename(struct inode + extern int nfs_congestion_kb; + extern int nfs_writepage(struct page *page, struct writeback_control *wbc); + extern int nfs_writepages(struct address_space *, struct writeback_control *); +-extern int nfs_flush_incompatible(struct file *file, struct page *page); +-extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); ++struct pnfs_layout_segment; ++extern int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg); ++extern int nfs_updatepage(struct file *, struct page *, ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata); + extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); + + /* +diff -up linux-2.6.38.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.38.noarch/include/linux/nfs_fs_sb.h +--- linux-2.6.38.noarch/include/linux/nfs_fs_sb.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/nfs_fs_sb.h 2011-03-26 07:57:44.316820955 
-0400 +@@ -30,6 +30,8 @@ struct nfs_client { + #define NFS_CS_CALLBACK 1 /* - callback started */ + #define NFS_CS_IDMAP 2 /* - idmap started */ + #define NFS_CS_RENEWD 3 /* - renewd started */ ++#define NFS_CS_STOP_RENEW 4 /* no more state to renew */ ++#define NFS_CS_CHECK_LEASE_TIME 5 /* need to check lease time */ + struct sockaddr_storage cl_addr; /* server identifier */ + size_t cl_addrlen; + char * cl_hostname; /* hostname of server */ +@@ -83,6 +85,16 @@ struct nfs_client { + #endif + }; + ++static inline bool ++is_ds_only_client(struct nfs_client *clp) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ return is_ds_only_session(clp->cl_exchange_flags); ++#else ++ return false; ++#endif ++} ++ + /* + * NFS client parameters stored in the superblock. + */ +@@ -128,7 +140,7 @@ struct nfs_server { + #endif + + #ifdef CONFIG_NFS_V4 +- u32 attr_bitmask[2];/* V4 bitmask representing the set ++ u32 attr_bitmask[3];/* V4 bitmask representing the set + of attributes supported on this + filesystem */ + u32 cache_consistency_bitmask[2]; +@@ -140,7 +152,11 @@ struct nfs_server { + that are supported on this + filesystem */ + struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ +- struct rpc_wait_queue roc_rpcwaitq; ++ struct rpc_wait_queue roc_rpcwaitq; ++ void *pnfs_ld_data; /* Per-mount data */ ++ unsigned int ds_rsize; /* Data server read size */ ++ unsigned int ds_wsize; /* Data server write size */ ++ u32 pnfs_blksize; /* layout_blksize attr */ + + /* the following fields are protected by nfs_client->cl_lock */ + struct rb_root state_owners; +diff -up linux-2.6.38.noarch/include/linux/nfs_iostat.h.orig linux-2.6.38.noarch/include/linux/nfs_iostat.h +--- linux-2.6.38.noarch/include/linux/nfs_iostat.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/nfs_iostat.h 2011-03-26 07:57:44.318820937 -0400 +@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + NFSIOS_DELAY, ++ NFSIOS_PNFS_READ, ++ 
NFSIOS_PNFS_WRITE, ++ NFSIOS_PNFS_COMMIT, + __NFSIOS_COUNTSMAX, + }; + +diff -up linux-2.6.38.noarch/include/linux/nfs_page.h.orig linux-2.6.38.noarch/include/linux/nfs_page.h +--- linux-2.6.38.noarch/include/linux/nfs_page.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/nfs_page.h 2011-03-26 07:57:44.319820928 -0400 +@@ -49,6 +49,7 @@ struct nfs_page { + struct kref wb_kref; /* reference count */ + unsigned long wb_flags; + struct nfs_writeverf wb_verf; /* Commit cookie */ ++ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */ + }; + + struct nfs_pageio_descriptor { +@@ -62,6 +63,11 @@ struct nfs_pageio_descriptor { + int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); + int pg_ioflags; + int pg_error; ++ struct pnfs_layout_segment *pg_lseg; ++#ifdef CONFIG_NFS_V4_1 ++ int pg_iswrite; ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++#endif /* CONFIG_NFS_V4_1 */ + }; + + #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) +@@ -70,13 +76,15 @@ extern struct nfs_page *nfs_create_reque + struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int count); ++ unsigned int count, ++ struct pnfs_layout_segment *lseg); + extern void nfs_clear_request(struct nfs_page *req); + extern void nfs_release_request(struct nfs_page *req); + + + extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, +- pgoff_t idx_start, unsigned int npages, int tag); ++ pgoff_t idx_start, unsigned int npages, int tag, ++ int *use_pnfs); + extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, + struct inode *inode, + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), +diff -up linux-2.6.38.noarch/include/linux/nfs_xdr.h.orig linux-2.6.38.noarch/include/linux/nfs_xdr.h +--- linux-2.6.38.noarch/include/linux/nfs_xdr.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/nfs_xdr.h 
2011-03-26 07:57:44.321820912 -0400 +@@ -3,6 +3,9 @@ + + #include + #include ++#include ++#include ++#include + + /* + * To change the maximum rsize and wsize supported by the NFS client, adjust +@@ -10,7 +13,7 @@ + * support a megabyte or more. The default is left at 4096 bytes, which is + * reasonable for NFS over UDP. + */ +-#define NFS_MAX_FILE_IO_SIZE (1048576U) ++#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U) + #define NFS_DEF_FILE_IO_SIZE (4096U) + #define NFS_MIN_FILE_IO_SIZE (1024U) + +@@ -115,6 +118,7 @@ struct nfs_fsinfo { + struct timespec time_delta; /* server time granularity */ + __u32 lease_time; /* in seconds */ + __u32 layouttype; /* supported pnfs layout driver */ ++ __u32 blksize; /* preferred pnfs io block size */ + }; + + struct nfs_fsstat { +@@ -226,6 +230,73 @@ struct nfs4_layoutget { + struct pnfs_layout_segment **lsegpp; + }; + ++struct nfs4_layoutcommit_args { ++ nfs4_stateid stateid; ++ __u64 lastbytewritten; ++ __u32 time_modify_changed; ++ struct timespec time_modify; ++ const u32 *bitmask; ++ struct nfs_fh *fh; ++ struct inode *inode; ++ ++ /* Values set by layout driver */ ++ struct pnfs_layout_range range; ++ __u32 layout_type; ++ void *layoutdriver_data; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutcommit_res { ++ __u32 sizechanged; ++ __u64 newsize; ++ struct nfs_fattr *fattr; ++ const struct nfs_server *server; ++ struct nfs4_sequence_res seq_res; ++ int status; ++}; ++ ++struct nfs4_layoutcommit_data { ++ struct rpc_task task; ++ struct rpc_cred *cred; ++ struct nfs_fattr fattr; ++ struct nfs4_layoutcommit_args args; ++ struct nfs4_layoutcommit_res res; ++}; ++ ++struct nfs4_layoutreturn_args { ++ __u32 reclaim; ++ __u32 layout_type; ++ __u32 return_type; ++ struct pnfs_layout_range range; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutreturn_res { ++ struct nfs4_sequence_res seq_res; ++ u32 lrs_present; ++ nfs4_stateid stateid; ++}; ++ ++struct nfs4_layoutreturn { 
++ struct nfs4_layoutreturn_args args; ++ struct nfs4_layoutreturn_res res; ++ struct rpc_cred *cred; ++ struct nfs_client *clp; ++ int rpc_status; ++}; ++ ++struct nfs4_getdevicelist_args { ++ const struct nfs_fh *fh; ++ u32 layoutclass; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_getdevicelist_res { ++ struct pnfs_devicelist *devlist; ++ struct nfs4_sequence_res seq_res; ++}; ++ + struct nfs4_getdeviceinfo_args { + struct pnfs_device *pdev; + struct nfs4_sequence_args seq_args; +@@ -889,7 +960,7 @@ struct nfs4_server_caps_arg { + }; + + struct nfs4_server_caps_res { +- u32 attr_bitmask[2]; ++ u32 attr_bitmask[3]; + u32 acl_bitmask; + u32 has_links; + u32 has_symlinks; +@@ -1004,6 +1075,30 @@ struct nfs_page; + + #define NFS_PAGEVEC_SIZE (8U) + ++#if defined(CONFIG_NFS_V4_1) ++ ++/* pnfsflag values */ ++enum pnfs_flags { ++ PNFS_NO_RPC = 1 << 0, /* non rpc result callback switch */ ++}; ++ ++/* pnfs-specific data needed for read, write, and commit calls */ ++struct pnfs_call_data { ++ struct pnfs_layout_segment *lseg; ++ const struct rpc_call_ops *call_ops; ++ u32 orig_count; /* for retry via MDS */ ++ int pnfs_error; ++ u8 pnfsflags; ++ u8 how; /* for FLUSH_STABLE */ ++}; ++ ++/* files layout-type specific data for read, write, and commit */ ++struct pnfs_fl_call_data { ++ struct nfs_client *ds_nfs_client; ++ __u64 orig_offset; ++}; ++#endif /* CONFIG_NFS_V4_1 */ ++ + struct nfs_read_data { + int flags; + struct rpc_task task; +@@ -1019,10 +1114,16 @@ struct nfs_read_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + + struct nfs_write_data { ++ struct kref refcount; /* For pnfs commit splitting */ ++ struct nfs_write_data *parent; /* For pnfs commit splitting */ + int flags; + struct rpc_task task; + struct inode *inode; +@@ 
-1038,6 +1139,10 @@ struct nfs_write_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + +diff -up linux-2.6.38.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.38.noarch/include/linux/panfs_shim_api.h +--- linux-2.6.38.noarch/include/linux/panfs_shim_api.h.orig 2011-03-26 07:57:44.329820843 -0400 ++++ linux-2.6.38.noarch/include/linux/panfs_shim_api.h 2011-03-26 07:57:44.329820843 -0400 +@@ -0,0 +1,57 @@ ++#ifndef _PANFS_SHIM_API_H ++#define _PANFS_SHIM_API_H ++ ++/* ++ * imported panfs functions ++ */ ++struct panfs_export_operations { ++ int (*convert_rc)(pan_status_t rc); ++ ++ int (*sm_sec_t_get_size_otw)( ++ pan_sm_sec_otw_t *var, ++ pan_size_t *core_sizep, ++ pan_size_t *wire_size, ++ void *buf_end); ++ ++ int (*sm_sec_t_unmarshall)( ++ pan_sm_sec_otw_t *in, ++ pan_sm_sec_t *out, ++ void *buf, ++ pan_size_t size, ++ pan_size_t *otw_consumed, ++ pan_size_t *in_core_consumed); ++ ++ int (*ucreds_get)(void **ucreds_pp); ++ ++ void (*ucreds_put)(void *ucreds); ++ ++ int (*sam_read)( ++ pan_sam_access_flags_t flags, ++ pan_sam_read_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_read_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p); ++ ++ int (*sam_write)( ++ pan_sam_access_flags_t flags, ++ pan_sam_write_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_write_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p); ++}; ++ ++extern int ++panfs_shim_register(struct panfs_export_operations *ops); ++ ++extern int ++panfs_shim_unregister(void); ++ ++#endif /* _PANFS_SHIM_API_H */ +diff -up linux-2.6.38.noarch/include/linux/pnfs_osd_xdr.h.orig 
linux-2.6.38.noarch/include/linux/pnfs_osd_xdr.h +--- linux-2.6.38.noarch/include/linux/pnfs_osd_xdr.h.orig 2011-03-26 07:57:44.331820825 -0400 ++++ linux-2.6.38.noarch/include/linux/pnfs_osd_xdr.h 2011-03-26 07:57:44.331820825 -0400 +@@ -0,0 +1,439 @@ ++/* ++ * pnfs_osd_xdr.h ++ * ++ * pNFS-osd on-the-wire data structures ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#ifndef __PNFS_OSD_XDR_H__ ++#define __PNFS_OSD_XDR_H__ ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_OSD_OSDNAME_MAXSIZE 256 ++ ++/* ++ * START OF "GENERIC" DECODE ROUTINES. ++ * These may look a little ugly since they are imported from a "generic" ++ * set of XDR encode/decode routines which are intended to be shared by ++ * all of our NFSv4 implementations (OpenBSD, MacOS X...). ++ * ++ * If the pain of reading these is too great, it should be a straightforward ++ * task to translate them into Linux-specific versions which are more ++ * consistent with the style used in NFSv2/v3... 
++ */ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (u64)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++ ++/* ++ * draft-ietf-nfsv4-minorversion-22 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* Layout Structure */ ++ ++enum pnfs_osd_raid_algorithm4 { ++ PNFS_OSD_RAID_0 = 1, ++ PNFS_OSD_RAID_4 = 2, ++ PNFS_OSD_RAID_5 = 3, ++ PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ ++}; ++ ++/* struct pnfs_osd_data_map4 { ++ * uint32_t odm_num_comps; ++ * length4 odm_stripe_unit; ++ * uint32_t odm_group_width; ++ * uint32_t odm_group_depth; ++ * uint32_t odm_mirror_cnt; ++ * pnfs_osd_raid_algorithm4 odm_raid_algorithm; ++ * }; ++ */ ++struct pnfs_osd_data_map { ++ u32 odm_num_comps; ++ u64 odm_stripe_unit; ++ u32 odm_group_width; ++ u32 odm_group_depth; ++ u32 odm_mirror_cnt; ++ u32 odm_raid_algorithm; ++}; ++ ++static inline int ++pnfs_osd_data_map_xdr_sz(void) ++{ ++ return 1 + 2 + 1 + 1 + 1 + 1; ++} ++ ++static inline size_t ++pnfs_osd_data_map_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_data_map); ++} ++ ++/* struct pnfs_osd_objid4 { ++ * deviceid4 oid_device_id; ++ * uint64_t oid_partition_id; ++ * uint64_t oid_object_id; ++ * }; ++ */ ++struct pnfs_osd_objid { ++ struct nfs4_deviceid oid_device_id; ++ u64 oid_partition_id; ++ u64 oid_object_id; ++}; ++ ++/* For printout. 
I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */ ++#define _DEVID_LO(oid_device_id) \ ++ (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data) ++ ++#define _DEVID_HI(oid_device_id) \ ++ (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1) ++ ++static inline int ++pnfs_osd_objid_xdr_sz(void) ++{ ++ return (NFS4_DEVICEID4_SIZE / 4) + 2 + 2; ++} ++ ++static inline size_t ++pnfs_osd_objid_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_objid); ++} ++ ++enum pnfs_osd_version { ++ PNFS_OSD_MISSING = 0, ++ PNFS_OSD_VERSION_1 = 1, ++ PNFS_OSD_VERSION_2 = 2 ++}; ++ ++struct pnfs_osd_opaque_cred { ++ u32 cred_len; ++ u8 *cred; ++}; ++ ++static inline int ++pnfs_osd_opaque_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ READ32(n); ++ p += XDR_QUADLEN(n); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_opaque_cred_incore_sz(u32 *p) ++{ ++ u32 n; ++ ++ READ32(n); ++ return XDR_QUADLEN(n) * 4; ++} ++ ++enum pnfs_osd_cap_key_sec { ++ PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ PNFS_OSD_CAP_KEY_SEC_SSV = 1, ++}; ++ ++/* struct pnfs_osd_object_cred4 { ++ * pnfs_osd_objid4 oc_object_id; ++ * pnfs_osd_version4 oc_osd_version; ++ * pnfs_osd_cap_key_sec4 oc_cap_key_sec; ++ * opaque oc_capability_key<>; ++ * opaque oc_capability<>; ++ * }; ++ */ ++struct pnfs_osd_object_cred { ++ struct pnfs_osd_objid oc_object_id; ++ u32 oc_osd_version; ++ u32 oc_cap_key_sec; ++ struct pnfs_osd_opaque_cred oc_cap_key; ++ struct pnfs_osd_opaque_cred oc_cap; ++}; ++ ++static inline int ++pnfs_osd_object_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_object_cred_incore_sz(u32 *p) ++{ ++ size_t sz = sizeof(struct pnfs_osd_object_cred); ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ sz += pnfs_osd_opaque_cred_incore_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ sz += 
pnfs_osd_opaque_cred_incore_sz(p); ++ return sz; ++} ++ ++/* struct pnfs_osd_layout4 { ++ * pnfs_osd_data_map4 olo_map; ++ * uint32_t olo_comps_index; ++ * pnfs_osd_object_cred4 olo_components<>; ++ * }; ++ */ ++struct pnfs_osd_layout { ++ struct pnfs_osd_data_map olo_map; ++ u32 olo_comps_index; ++ u32 olo_num_comps; ++ struct pnfs_osd_object_cred *olo_comps; ++}; ++ ++static inline int ++pnfs_osd_layout_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ while ((int)(n--) > 0) ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_layout_incore_sz(u32 *p) ++{ ++ u32 n; ++ size_t sz; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ sz = sizeof(struct pnfs_osd_layout); ++ while ((int)(n--) > 0) { ++ sz += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ return sz; ++} ++ ++/* Device Address */ ++ ++enum pnfs_osd_targetid_type { ++ OBJ_TARGET_ANON = 1, ++ OBJ_TARGET_SCSI_NAME = 2, ++ OBJ_TARGET_SCSI_DEVICE_ID = 3, ++}; ++ ++/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) { ++ * case OBJ_TARGET_SCSI_NAME: ++ * string oti_scsi_name<>; ++ * ++ * case OBJ_TARGET_SCSI_DEVICE_ID: ++ * opaque oti_scsi_device_id<>; ++ * ++ * default: ++ * void; ++ * }; ++ * ++ * union pnfs_osd_targetaddr4 switch (bool ota_available) { ++ * case TRUE: ++ * netaddr4 ota_netaddr; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_deviceaddr4 { ++ * pnfs_osd_targetid4 oda_targetid; ++ * pnfs_osd_targetaddr4 oda_targetaddr; ++ * uint64_t oda_lun; ++ * opaque oda_systemid<>; ++ * pnfs_osd_object_cred4 oda_root_obj_cred; ++ * opaque oda_osdname<>; ++ * }; ++ */ ++struct pnfs_osd_targetid { ++ u32 oti_type; ++ struct nfs4_string oti_scsi_device_id; ++}; ++ ++enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 }; ++ ++/* struct netaddr4 { ++ * // see struct rpcb in RFC1833 ++ * string r_netid<>; // network id 
++ * string r_addr<>; // universal address ++ * }; ++ */ ++struct pnfs_osd_net_addr { ++ struct nfs4_string r_netid; ++ struct nfs4_string r_addr; ++}; ++ ++struct pnfs_osd_targetaddr { ++ u32 ota_available; ++ struct pnfs_osd_net_addr ota_netaddr; ++}; ++ ++enum { ++ NETWORK_ID_MAX = 16 / 4, ++ UNIVERSAL_ADDRESS_MAX = 64 / 4, ++ PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX, ++}; ++ ++struct pnfs_osd_deviceaddr { ++ struct pnfs_osd_targetid oda_targetid; ++ struct pnfs_osd_targetaddr oda_targetaddr; ++ u8 oda_lun[8]; ++ struct nfs4_string oda_systemid; ++ struct pnfs_osd_object_cred oda_root_obj_cred; ++ struct nfs4_string oda_osdname; ++}; ++ ++enum { ++ ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4, ++ PNFS_OSD_DEVICEADDR_MAX = ++ PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX + ++ 2 /*oda_lun*/ + ++ 1 + OSD_SYSTEMID_LEN + ++ 1 + ODA_OSDNAME_MAX, ++}; ++ ++/* LAYOUTCOMMIT: layoutupdate */ ++ ++/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) { ++ * case TRUE: ++ * int64_t dsu_delta; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_layoutupdate4 { ++ * pnfs_osd_deltaspaceused4 olu_delta_space_used; ++ * bool olu_ioerr_flag; ++ * }; ++ */ ++struct pnfs_osd_layoutupdate { ++ u32 dsu_valid; ++ s64 dsu_delta; ++ u32 olu_ioerr_flag; ++}; ++ ++/* LAYOUTRETURN: I/O Rrror Report */ ++ ++enum pnfs_osd_errno { ++ PNFS_OSD_ERR_EIO = 1, ++ PNFS_OSD_ERR_NOT_FOUND = 2, ++ PNFS_OSD_ERR_NO_SPACE = 3, ++ PNFS_OSD_ERR_BAD_CRED = 4, ++ PNFS_OSD_ERR_NO_ACCESS = 5, ++ PNFS_OSD_ERR_UNREACHABLE = 6, ++ PNFS_OSD_ERR_RESOURCE = 7 ++}; ++ ++/* struct pnfs_osd_ioerr4 { ++ * pnfs_osd_objid4 oer_component; ++ * length4 oer_comp_offset; ++ * length4 oer_comp_length; ++ * bool oer_iswrite; ++ * pnfs_osd_errno4 oer_errno; ++ * }; ++ */ ++struct pnfs_osd_ioerr { ++ struct pnfs_osd_objid oer_component; ++ u64 oer_comp_offset; ++ u64 oer_comp_length; ++ u32 oer_iswrite; ++ u32 oer_errno; ++}; ++ ++static inline unsigned ++pnfs_osd_ioerr_xdr_sz(void) 
++{ ++ return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1; ++} ++ ++/* OSD XDR API */ ++ ++/* Layout helpers */ ++extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout( ++ struct pnfs_osd_layout *layout, u32 *p); ++ ++extern int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *layout); ++ ++/* Device Info helpers */ ++ ++/* First pass calculate total size for space needed */ ++extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p); ++ ++/* Note: some strings pointed to inside @deviceaddr might point ++ * to space inside @p. @p should stay valid while @deviceaddr ++ * is in use. ++ * It is assumed that @deviceaddr points to bigger memory of size ++ * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz() ++ */ ++extern void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p); ++ ++/* For Servers */ ++extern int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr); ++ ++/* layoutupdate (layout_commit) xdr helpers */ ++extern int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou); ++extern __be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p); ++ ++/* osd_ioerror encoding/decoding (layout_return) */ ++extern int ++pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr); ++extern __be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p); ++ ++#endif /* __PNFS_OSD_XDR_H__ */ +diff -up linux-2.6.38.noarch/include/linux/posix_acl.h.orig linux-2.6.38.noarch/include/linux/posix_acl.h +--- linux-2.6.38.noarch/include/linux/posix_acl.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/posix_acl.h 2011-03-26 07:57:44.332820817 -0400 +@@ -8,6 +8,7 @@ + #ifndef __LINUX_POSIX_ACL_H + #define __LINUX_POSIX_ACL_H + ++#include + #include + + #define ACL_UNDEFINED_ID (-1) +diff -up 
linux-2.6.38.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.38.noarch/include/linux/sunrpc/msg_prot.h +--- linux-2.6.38.noarch/include/linux/sunrpc/msg_prot.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/sunrpc/msg_prot.h 2011-03-26 07:57:44.332820817 -0400 +@@ -14,6 +14,8 @@ + /* size of an XDR encoding unit in bytes, i.e. 32bit */ + #define XDR_UNIT (4) + ++#include ++ + /* spec defines authentication flavor as an unsigned 32 bit integer */ + typedef u32 rpc_authflavor_t; + +diff -up linux-2.6.38.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.38.noarch/include/linux/sunrpc/rpc_pipe_fs.h +--- linux-2.6.38.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2011-03-26 07:57:44.333820809 -0400 +@@ -3,6 +3,7 @@ + + #ifdef __KERNEL__ + ++#include + #include + + struct rpc_pipe_msg { +@@ -11,6 +12,10 @@ struct rpc_pipe_msg { + size_t len; + size_t copied; + int errno; ++#define PIPEFS_AUTOFREE_RPCMSG 0x01 /* frees rpc_pipe_msg */ ++#define PIPEFS_AUTOFREE_RPCMSG_DATA 0x02 /* frees rpc_pipe_msg->data */ ++#define PIPEFS_AUTOFREE_UPCALL_MSG PIPEFS_AUTOFREE_RPCMSG_DATA ++ u8 flags; + }; + + struct rpc_pipe_ops { +diff -up linux-2.6.38.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.38.noarch/include/linux/sunrpc/simple_rpc_pipefs.h +--- linux-2.6.38.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2011-03-26 07:57:44.334820801 -0400 ++++ linux-2.6.38.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2011-03-26 07:57:44.334820801 -0400 +@@ -0,0 +1,105 @@ ++/* ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. 
++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#ifndef _SIMPLE_RPC_PIPEFS_H_ ++#define _SIMPLE_RPC_PIPEFS_H_ ++ ++#include ++ ++#define payload_of(headerp) ((void *)(headerp + 1)) ++ ++/* ++ * struct pipefs_hdr -- the generic message format for simple_rpc_pipefs. ++ * Messages may simply be the header itself, although having an optional ++ * data payload follow the header allows much more flexibility. 
++ * ++ * Messages are created using pipefs_alloc_init_msg() and ++ * pipefs_alloc_init_msg_padded(), both of which accept a pointer to an ++ * (optional) data payload. ++ * ++ * Given a struct pipefs_hdr *msg that has a struct foo payload, the data ++ * can be accessed using: struct foo *foop = payload_of(msg) ++ */ ++struct pipefs_hdr { ++ u32 msgid; ++ u8 type; ++ u8 flags; ++ u16 totallen; /* length of entire message, including hdr itself */ ++ u32 status; ++}; ++ ++/* ++ * struct pipefs_list -- a type of list used for tracking callers who've made an ++ * upcall and are blocked waiting for a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply(). ++ */ ++struct pipefs_list { ++ struct list_head list; ++ spinlock_t list_lock; ++}; ++ ++ ++/* See net/sunrpc/simple_rpc_pipefs.c for more info on using these functions. */ ++extern struct dentry *pipefs_mkpipe(const char *name, ++ const struct rpc_pipe_ops *ops, ++ int wait_for_open); ++extern void pipefs_closepipe(struct dentry *pipe); ++extern void pipefs_init_list(struct pipefs_list *list); ++extern struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen); ++extern struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, ++ u8 flags, void *data, ++ u16 datalen, u16 padlen); ++extern struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list ++ *uplist, u8 upflags, ++ u32 timeout); ++extern int pipefs_queue_upcall_noreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, u8 upflags); ++extern int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist); ++extern struct pipefs_hdr *pipefs_readmsg(struct file *filp, ++ const char __user *src, size_t len); ++extern ssize_t pipefs_generic_upcall(struct file *filp, ++ struct rpc_pipe_msg *rpcmsg, ++ char __user *dst, size_t buflen); ++extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg); 
++
++#endif /* _SIMPLE_RPC_PIPEFS_H_ */
+diff -up linux-2.6.38.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.38.noarch/include/linux/sunrpc/svc_xprt.h
+--- linux-2.6.38.noarch/include/linux/sunrpc/svc_xprt.h.orig 2011-03-14 21:20:32.000000000 -0400
++++ linux-2.6.38.noarch/include/linux/sunrpc/svc_xprt.h 2011-03-26 07:57:44.334820801 -0400
+@@ -204,4 +204,39 @@ static inline char *__svc_print_addr(con
+ 
+ 	return buf;
+ }
++
++/*
++ * Format @addr as a "universal address" string (see rfc1833 and nfsv4.1)
++ * into @na->data, bounded by @na->len: the printed IP address is followed
++ * by ".<high byte of port>.<low byte of port>".
++ *
++ * Returns the snprintf() result for AF_INET and AF_INET6.  For any other
++ * address family an "unknown address type" message is left in the buffer
++ * and -EINVAL is returned.
++ */
++static inline int __svc_print_netaddr(struct sockaddr *addr,
++				      struct xdr_netobj *na)
++{
++	u16 port;
++
++	if (addr->sa_family == AF_INET) {
++		struct sockaddr_in *sin = (struct sockaddr_in *)addr;
++
++		port = ntohs(sin->sin_port);
++		return snprintf(na->data, na->len, "%pI4.%u.%u",
++				&sin->sin_addr, port >> 8, port & 0xff);
++	}
++
++	if (addr->sa_family == AF_INET6) {
++		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
++
++		port = ntohs(sin6->sin6_port);
++		return snprintf(na->data, na->len, "%pI6.%u.%u",
++				&sin6->sin6_addr, port >> 8, port & 0xff);
++	}
++
++	snprintf(na->data, na->len, "unknown address type: %d",
++		 addr->sa_family);
++	return -EINVAL;
++}
+ #endif /* SUNRPC_SVC_XPRT_H */
+diff -up linux-2.6.38.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.38.noarch/include/linux/sunrpc/xdr.h
+--- linux-2.6.38.noarch/include/linux/sunrpc/xdr.h.orig 2011-03-14 21:20:32.000000000 -0400
++++ linux-2.6.38.noarch/include/linux/sunrpc/xdr.h 2011-03-26 07:57:44.335820793 -0400
+@@ -213,6 +213,7 @@ typedef int (*kxdrdproc_t)(void *rqstp,
+ 
+ extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
+ extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
++extern __be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q);
+ extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
+ 			    unsigned int base,
unsigned int len); + extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); +diff -up linux-2.6.38.noarch/net/sunrpc/Makefile.orig linux-2.6.38.noarch/net/sunrpc/Makefile +--- linux-2.6.38.noarch/net/sunrpc/Makefile.orig 2011-03-14 21:20:32.000000000 -0400 ++++ linux-2.6.38.noarch/net/sunrpc/Makefile 2011-03-26 07:57:44.336820785 -0400 +@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt + svc.o svcsock.o svcauth.o svcauth_unix.o \ + addr.o rpcb_clnt.o timer.o xdr.o \ + sunrpc_syms.o cache.o rpc_pipe.o \ +- svc_xprt.o ++ svc_xprt.o simple_rpc_pipefs.o + sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o + sunrpc-$(CONFIG_PROC_FS) += stats.o + sunrpc-$(CONFIG_SYSCTL) += sysctl.o +diff -up linux-2.6.38.noarch/net/sunrpc/sched.c.orig linux-2.6.38.noarch/net/sunrpc/sched.c +--- linux-2.6.38.noarch/net/sunrpc/sched.c.orig 2011-03-26 07:53:04.357196210 -0400 ++++ linux-2.6.38.noarch/net/sunrpc/sched.c 2011-03-26 07:57:44.337820776 -0400 +@@ -787,11 +787,11 @@ EXPORT_SYMBOL_GPL(rpc_free); + /* + * Creation and deletion of RPC task structures + */ +-static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data) ++static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data, unsigned short extra_flags) + { + memset(task, 0, sizeof(*task)); + atomic_set(&task->tk_count, 1); +- task->tk_flags = task_setup_data->flags; ++ task->tk_flags = task_setup_data->flags | extra_flags; + task->tk_ops = task_setup_data->callback_ops; + task->tk_calldata = task_setup_data->callback_data; + INIT_LIST_HEAD(&task->tk_task); +@@ -840,14 +840,14 @@ struct rpc_task *rpc_new_task(const stru + flags = RPC_TASK_DYNAMIC; + } + +- rpc_init_task(task, setup_data); ++ rpc_init_task(task, setup_data, flags); ++ + if (task->tk_status < 0) { + int err = task->tk_status; + rpc_put_task(task); + return ERR_PTR(err); + } + +- task->tk_flags |= flags; + dprintk("RPC: allocated task %p\n", task); + 
return task; + } +diff -up linux-2.6.38.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.38.noarch/net/sunrpc/simple_rpc_pipefs.c +--- linux-2.6.38.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2011-03-26 07:57:44.338820767 -0400 ++++ linux-2.6.38.noarch/net/sunrpc/simple_rpc_pipefs.c 2011-03-26 07:57:44.338820767 -0400 +@@ -0,0 +1,423 @@ ++/* ++ * net/sunrpc/simple_rpc_pipefs.c ++ * ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ * With thanks to CITI's project sponsor and partner, IBM.
++ */
++
++#include
++#include
++#include
++
++
++/*
++ * Make an rpc_pipefs pipe named @name at the root of the mounted rpc_pipefs
++ * filesystem.
++ *
++ * If @wait_for_open is non-zero and an upcall is later queued but the userland
++ * end of the pipe has not yet been opened, the upcall will remain queued until
++ * the pipe is opened; otherwise, the upcall queueing will return with -EPIPE.
++ *
++ * Returns the new pipe's dentry, or an ERR_PTR() on failure.  On success a
++ * reference on the rpc_pipefs mount is held until pipefs_closepipe() is
++ * called; on failure the reference taken here is dropped again, since the
++ * caller has no dentry to hand to pipefs_closepipe().
++ */
++struct dentry *pipefs_mkpipe(const char *name, const struct rpc_pipe_ops *ops,
++			     int wait_for_open)
++{
++	struct dentry *dir, *pipe;
++	struct vfsmount *mnt;
++
++	mnt = rpc_get_mount();
++	if (IS_ERR(mnt))
++		return ERR_CAST(mnt);
++
++	dir = mnt->mnt_root;
++	if (!dir) {
++		pipe = ERR_PTR(-ENOENT);
++		goto out_put_mount;
++	}
++
++	pipe = rpc_mkpipe(dir, name, NULL, ops,
++			  wait_for_open ? RPC_PIPE_WAIT_FOR_OPEN : 0);
++	if (IS_ERR(pipe))
++		goto out_put_mount;
++
++	return pipe;
++
++out_put_mount:
++	/* balance rpc_get_mount(): nobody will call pipefs_closepipe() */
++	rpc_put_mount();
++	return pipe;
++}
++EXPORT_SYMBOL(pipefs_mkpipe);
++
++/*
++ * Shutdown a pipe made by pipefs_mkpipe(): unlink the pipe and drop the
++ * mount reference that pipefs_mkpipe() took.
++ * XXX: do we need to retain an extra reference on the mount?
++ */
++void pipefs_closepipe(struct dentry *pipe)
++{
++	rpc_unlink(pipe);
++	rpc_put_mount();
++}
++EXPORT_SYMBOL(pipefs_closepipe);
++
++/*
++ * Initialize a struct pipefs_list -- which are a way to keep track of callers
++ * who're blocked having made an upcall and are awaiting a reply.
++ *
++ * See pipefs_queue_upcall_waitreply() and pipefs_find_upcall_msgid() for how
++ * to use them.
++ */
++inline void pipefs_init_list(struct pipefs_list *list)
++{
++	INIT_LIST_HEAD(&list->list);
++	spin_lock_init(&list->list_lock);
++}
++EXPORT_SYMBOL(pipefs_init_list);
++
++/*
++ * Alloc/init a generic pipefs message header and copy into its message body
++ * an arbitrary data payload.
++ *
++ * struct pipefs_hdr's are meant to serve as generic, general-purpose message
++ * headers for easy rpc_pipefs I/O. When an upcall is made, the
++ * struct pipefs_hdr is assigned to a struct rpc_pipe_msg and delivered
++ * therein. --And yes, the naming can seem a little confusing at first:
++ *
++ * When one thinks of an upcall "message", in simple_rpc_pipefs that's a
++ * struct pipefs_hdr (possibly with an attached message body). A
++ * struct rpc_pipe_msg is actually only the -vehicle- by which the "real"
++ * message is delivered and processed.
++ *
++ * @data may be NULL (or @datalen zero) for a header-only message; @padlen
++ * extra zeroed bytes are reserved after the payload.
++ *
++ * Returns the zero-initialised message, ERR_PTR(-E2BIG) if header + payload +
++ * padding exceeds PAGE_SIZE or cannot be represented in the u16 totallen
++ * field, or ERR_PTR(-ENOMEM) if the allocation fails.
++ */
++struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, u8 flags,
++						void *data, u16 datalen, u16 padlen)
++{
++	struct pipefs_hdr *msg;
++	size_t totallen;
++
++	/*
++	 * Sum in size_t: a u16 sum silently wraps for large datalen/padlen,
++	 * which would pass the size check and let the memcpy() below run past
++	 * the end of a too-small allocation.
++	 */
++	totallen = sizeof(*msg) + (size_t)datalen + (size_t)padlen;
++	if (totallen > PAGE_SIZE || totallen > USHRT_MAX)
++		return ERR_PTR(-E2BIG);
++
++	msg = kzalloc(totallen, GFP_KERNEL);
++	if (!msg)
++		return ERR_PTR(-ENOMEM);
++
++	msg->msgid = msgid;
++	msg->type = type;
++	msg->flags = flags;
++	msg->totallen = totallen;
++	/* the payload is optional; do not hand memcpy() a NULL source */
++	if (data && datalen)
++		memcpy(payload_of(msg), data, datalen);
++	return msg;
++}
++EXPORT_SYMBOL(pipefs_alloc_init_msg_padded);
++
++/*
++ * See the description of pipefs_alloc_init_msg_padded().
++ */
++struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags,
++					 void *data, u16 datalen)
++{
++	return pipefs_alloc_init_msg_padded(msgid, type, flags, data,
++					    datalen, 0);
++}
++EXPORT_SYMBOL(pipefs_alloc_init_msg);
++
++
++/* Zero @rpcmsg and point it at @msg; the length comes from msg->totallen. */
++static void pipefs_init_rpcmsg(struct rpc_pipe_msg *rpcmsg,
++			       struct pipefs_hdr *msg, u8 upflags)
++{
++	memset(rpcmsg, 0, sizeof(*rpcmsg));
++	rpcmsg->data = msg;
++	rpcmsg->len = msg->totallen;
++	rpcmsg->flags = upflags;
++}
++
++/* kmalloc a struct rpc_pipe_msg wrapping @msg; ERR_PTR(-ENOMEM) on failure. */
++static struct rpc_pipe_msg *pipefs_alloc_init_rpcmsg(struct pipefs_hdr *msg,
++						     u8 upflags)
++{
++	struct rpc_pipe_msg *rpcmsg;
++
++	rpcmsg = kmalloc(sizeof(*rpcmsg), GFP_KERNEL);
++	if (!rpcmsg)
++		return ERR_PTR(-ENOMEM);
++
++	pipefs_init_rpcmsg(rpcmsg, msg, upflags);
++	return rpcmsg;
++}
++
++
++/* represents an upcall that'll block and wait for a reply */
++struct pipefs_upcall {
++	u32 msgid;
++	struct rpc_pipe_msg rpcmsg;
++	struct list_head list;
++	wait_queue_head_t waitq;
++	struct pipefs_hdr *reply;
++};
++
++
++static void pipefs_init_upcall_waitreply(struct pipefs_upcall *upcall,
++					 struct pipefs_hdr *msg, u8 upflags)
++{
++	upcall->reply = NULL;
++	upcall->msgid = msg->msgid;
++	INIT_LIST_HEAD(&upcall->list);
++	init_waitqueue_head(&upcall->waitq);
++	pipefs_init_rpcmsg(&upcall->rpcmsg, msg, upflags);
++}
++
++/*
++ * Put @upcall on @uplist, queue its rpcmsg on @pipe and sleep
++ * (uninterruptibly) until a reply has been assigned or, if @timeout is
++ * nonzero, until @timeout jiffies have passed.  @upcall is always removed
++ * from @uplist again before returning.
++ *
++ * Returns 0 once the wait is over, -ETIMEDOUT if the timeout expired with no
++ * reply assigned, or the negative error from rpc_queue_upcall().
++ *
++ * The wait uses wait_event*() so that the "reply assigned?" condition is
++ * re-checked after the task is put on the wait queue: a reply that arrives
++ * between queueing the upcall and going to sleep is not missed.
++ *
++ * NOTE(review): after a timeout the rpcmsg embedded in @upcall may still be
++ * queued on the pipe while the caller's copy of @upcall goes away -- confirm
++ * how callers/->destroy_msg() cope with that.
++ */
++static int __pipefs_queue_upcall_waitreply(struct dentry *pipe,
++					   struct pipefs_upcall *upcall,
++					   struct pipefs_list *uplist,
++					   u32 timeout)
++{
++	int err;
++
++	spin_lock(&uplist->list_lock);
++	list_add(&upcall->list, &uplist->list);
++	spin_unlock(&uplist->list_lock);
++
++	err = rpc_queue_upcall(pipe->d_inode, &upcall->rpcmsg);
++	if (err < 0)
++		goto out;
++
++	if (timeout) {
++		long left;
++
++		/* retval of 0 means timer expired */
++		left = wait_event_timeout(upcall->waitq,
++					  upcall->reply != NULL, timeout);
++		if (left == 0 && upcall->reply == NULL)
++			err = -ETIMEDOUT;
++	} else {
++		wait_event(upcall->waitq, upcall->reply != NULL);
++	}
++
++out:
++	spin_lock(&uplist->list_lock);
++	list_del_init(&upcall->list);
++	spin_unlock(&uplist->list_lock);
++	return err;
++}
++
++/*
++ * Queue a pipefs msg for an upcall to userspace, place the calling thread
++ * on @uplist, and block the thread to wait for a reply. If @timeout is
++ * nonzero, the thread will be blocked for at most @timeout jiffies.
++ *
++ * (To convert time units into jiffies, consider the functions
++ * msecs_to_jiffies(), usecs_to_jiffies(), timeval_to_jiffies(), and
++ * timespec_to_jiffies().)
++ *
++ * Once a reply is received by your downcall handler, call
++ * pipefs_assign_upcall_reply() with @uplist to find the corresponding upcall,
++ * assign the reply, and wake the waiting thread.
++ *
++ * This function's return value pointer may be an error and should be checked
++ * with IS_ERR() before attempting to access the reply message.
++ *
++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg()
++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG
++ * flag is set in @upflags. See also rpc_pipe_fs.h.
++ */
++struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe,
++						 struct pipefs_hdr *msg,
++						 struct pipefs_list *uplist,
++						 u8 upflags, u32 timeout)
++{
++	int err = 0;
++	struct pipefs_upcall upcall;
++
++	pipefs_init_upcall_waitreply(&upcall, msg, upflags);
++	err = __pipefs_queue_upcall_waitreply(pipe, &upcall, uplist, timeout);
++	if (err < 0) {
++		/* a reply may have been assigned just as we gave up */
++		kfree(upcall.reply);
++		upcall.reply = ERR_PTR(err);
++	}
++
++	return upcall.reply;
++}
++EXPORT_SYMBOL(pipefs_queue_upcall_waitreply);
++
++/*
++ * Queue a pipefs msg for an upcall to userspace and immediately return (i.e.,
++ * no reply is expected).
++ *
++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg()
++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG
++ * flag is set in @upflags. See also rpc_pipe_fs.h.
++ *
++ * The struct rpc_pipe_msg wrapper is allocated here and marked
++ * PIPEFS_AUTOFREE_RPCMSG so that pipefs_generic_destroy_msg() releases it.
++ * If queueing fails the wrapper is freed here instead; @msg itself is left
++ * untouched in that case.
++ *
++ * Returns 0 on success, -ENOMEM if the wrapper cannot be allocated, or the
++ * error returned by rpc_queue_upcall().
++ */
++int pipefs_queue_upcall_noreply(struct dentry *pipe, struct pipefs_hdr *msg,
++				u8 upflags)
++{
++	struct rpc_pipe_msg *rpcmsg;
++	int err;
++
++	upflags |= PIPEFS_AUTOFREE_RPCMSG;
++	rpcmsg = pipefs_alloc_init_rpcmsg(msg, upflags);
++	if (IS_ERR(rpcmsg))
++		return PTR_ERR(rpcmsg);
++
++	err = rpc_queue_upcall(pipe->d_inode, rpcmsg);
++	if (err < 0) {
++		/*
++		 * The message was never queued, so nothing else will free the
++		 * wrapper.  NOTE(review): assumes rpc_queue_upcall() does not
++		 * call ->destroy_msg() on failure -- confirm for this tree.
++		 */
++		kfree(rpcmsg);
++	}
++	return err;
++}
++EXPORT_SYMBOL(pipefs_queue_upcall_noreply);
++
++
++/*
++ * Find the upcall on @uplist whose msgid equals @msgid, or NULL if there is
++ * none.  The list is only locked for the duration of the search.
++ *
++ * NOTE(review): the returned pointer is used after list_lock is dropped;
++ * verify the waiter cannot remove and release the upcall in the meantime.
++ */
++static struct pipefs_upcall *pipefs_find_upcall_msgid(u32 msgid,
++						 struct pipefs_list *uplist)
++{
++	struct pipefs_upcall *upcall;
++
++	spin_lock(&uplist->list_lock);
++	list_for_each_entry(upcall, &uplist->list, list)
++		if (upcall->msgid == msgid)
++			goto out;
++	upcall = NULL;
++out:
++	spin_unlock(&uplist->list_lock);
++	return upcall;
++}
++
++/*
++ * In your rpc_pipe_ops->downcall() handler, once you've read in a downcall
++ * message and have determined that it is a reply to a waiting upcall,
++ * you can use this function to find the appropriate upcall, assign the result,
++ * and wake the upcall thread.
++ *
++ * The reply message must have the same msgid as the original upcall message's.
++ *
++ * See also pipefs_queue_upcall_waitreply() and pipefs_readmsg().
++ *
++ * Returns 0 if a waiting upcall was found and woken, or -ENOENT if no upcall
++ * on @uplist carries the reply's msgid (the reply is not freed here).
++ */
++int pipefs_assign_upcall_reply(struct pipefs_hdr *reply,
++			       struct pipefs_list *uplist)
++{
++	struct pipefs_upcall *upcall;
++
++	upcall = pipefs_find_upcall_msgid(reply->msgid, uplist);
++	if (!upcall) {
++		/* msgid is a u32: print it unsigned */
++		printk(KERN_ERR "%s: ERROR: have reply but no matching upcall "
++			"for msgid %u\n", __func__, reply->msgid);
++		return -ENOENT;
++	}
++
++	/*
++	 * NOTE(review): @upcall is touched here without uplist->list_lock
++	 * held; confirm the waiter cannot have timed out and gone away.
++	 */
++	upcall->reply = reply;
++	wake_up(&upcall->waitq);
++	return 0;
++}
++EXPORT_SYMBOL(pipefs_assign_upcall_reply);
++
++/*
++ * Generic method to read-in and return a newly-allocated message which begins
++ * with a struct pipefs_hdr.
++ *
++ * Exactly @len bytes are copied from @src.  Returns the new message, or
++ * ERR_PTR(-EINVAL) if @len is smaller than the header, ERR_PTR(-ENOMEM) if
++ * the allocation fails, or ERR_PTR(-EFAULT) if the user copy fails.
++ *
++ * NOTE(review): the header's totallen field is not compared against @len
++ * here; callers that trust totallen should validate it themselves.
++ */
++struct pipefs_hdr *pipefs_readmsg(struct file *filp, const char __user *src,
++				  size_t len)
++{
++	int err = 0, hdrsize;
++	struct pipefs_hdr *msg = NULL;
++
++	hdrsize = sizeof(*msg);
++	if (len < hdrsize) {
++		printk(KERN_ERR "%s: ERROR: header is too short (%d vs %d)\n",
++		       __func__, (int) len, hdrsize);
++		err = -EINVAL;
++		goto out;
++	}
++
++	msg = kzalloc(len, GFP_KERNEL);
++	if (!msg) {
++		err = -ENOMEM;
++		goto out;
++	}
++	if (copy_from_user(msg, src, len))
++		err = -EFAULT;
++out:
++	if (err) {
++		kfree(msg);
++		msg = ERR_PTR(err);
++	}
++	return msg;
++}
++EXPORT_SYMBOL(pipefs_readmsg);
++
++/*
++ * Generic rpc_pipe_ops->upcall() handler implementation.
++ *
++ * Don't call this directly: to make an upcall, use
++ * pipefs_queue_upcall_waitreply() or pipefs_queue_upcall_noreply().
++ */
++ssize_t pipefs_generic_upcall(struct file *filp, struct rpc_pipe_msg *rpcmsg,
++			      char __user *dst, size_t buflen)
++{
++	char *data;
++	size_t len;
++	unsigned long left;
++
++	/* Skip the bytes already accounted for in rpcmsg->copied. */
++	data = (char *)rpcmsg->data + rpcmsg->copied;
++	len = rpcmsg->len - rpcmsg->copied;
++	if (len > buflen)
++		len = buflen;
++
++	/*
++	 * copy_to_user() returns the number of bytes it could NOT copy and
++	 * never a negative errno, so a "< 0" test can never fire.  Report
++	 * -EFAULT only when nothing at all was transferred; a partial copy
++	 * is accounted for below and returned as a short read.
++	 */
++	left = copy_to_user(dst, data, len);
++	if (len && left == len) {
++		rpcmsg->errno = -EFAULT;
++		return -EFAULT;
++	}
++
++	len -= left;
++	rpcmsg->copied += len;
++	rpcmsg->errno = 0;
++	return len;
++}
++EXPORT_SYMBOL(pipefs_generic_upcall);
++
++/*
++ * Generic rpc_pipe_ops->destroy_msg() handler implementation.
++ *
++ * Items are only freed if @rpcmsg->flags has been set appropriately.
++ * See pipefs_queue_upcall_noreply() and rpc_pipe_fs.h.
++ */
++void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg)
++{
++	/*
++	 * Free the payload first: rpcmsg->data is read through @rpcmsg,
++	 * which may itself be freed by the next statement.
++	 */
++	if (rpcmsg->flags & PIPEFS_AUTOFREE_UPCALL_MSG)
++		kfree(rpcmsg->data);
++	if (rpcmsg->flags & PIPEFS_AUTOFREE_RPCMSG)
++		kfree(rpcmsg);
++}
++EXPORT_SYMBOL(pipefs_generic_destroy_msg);
+diff -up linux-2.6.38.noarch/net/sunrpc/xdr.c.orig linux-2.6.38.noarch/net/sunrpc/xdr.c
+--- linux-2.6.38.noarch/net/sunrpc/xdr.c.orig	2011-03-14 21:20:32.000000000 -0400
++++ linux-2.6.38.noarch/net/sunrpc/xdr.c	2011-03-26 07:57:44.338820767 -0400
+@@ -518,6 +518,27 @@ __be32 * xdr_reserve_space(struct xdr_st
+ EXPORT_SYMBOL_GPL(xdr_reserve_space);
+ 
+ /**
++ * xdr_rewind_stream - rewind a stream back to some checkpoint
++ * @xdr: pointer to xdr_stream
++ * @q: some checkpoint at historical place of @xdr
++ *
++ * Restores an xdr stream to some historical point. @q must be
++ * a logical xdr point in the past that was sampled by @q = @xdr->p.
++ */
++__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q)
++{
++	size_t nbytes;
++
++	/* The checkpoint must not lie ahead of the current position. */
++	BUG_ON(q > xdr->p);
++	/* Both pointers step in 4-byte words; convert the gap to bytes. */
++	nbytes = (xdr->p - q) << 2;
++	BUG_ON(nbytes > xdr->iov->iov_len || nbytes > xdr->buf->len);
++
++	/* Take the rewound bytes back out of both length counters. */
++	xdr->buf->len -= nbytes;
++	xdr->iov->iov_len -= nbytes;
++	xdr->p = q;
++	return q;
++}
++EXPORT_SYMBOL_GPL(xdr_rewind_stream);
++
++/**
+  * xdr_write_pages - Insert a list of pages into an XDR buffer for sending
+  * @xdr: pointer to xdr_stream
+  * @pages: list of pages